1234567891011121314151617181920212223242526272829303132333435363738394041 |
- from io import BytesIO
- from typing import AsyncGenerator
- from core.base.parsers.base_parser import AsyncParser
- from core.base.providers import (
- CompletionProvider,
- DatabaseProvider,
- IngestionConfig,
- )
class DOCXParser(AsyncParser[str | bytes]):
    """A parser for DOCX data.

    Builds a `python-docx` document from raw bytes and yields the text of
    each of its paragraphs.
    """

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the configuration and providers and resolve `python-docx`.

        Args:
            config: Ingestion configuration; stored as `self.config`.
            database_provider: Stored as `self.database_provider`.
            llm_provider: Stored as `self.llm_provider`.

        Raises:
            ValueError: If the `python-docx` package cannot be imported.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

        # `docx` is imported here rather than at module level, so a missing
        # `python-docx` surfaces as the error below when a parser is built.
        # Only the import sits inside the `try`, since it is the only
        # statement that can raise ImportError.
        try:
            from docx import Document
        except ImportError as err:
            # The exception type stays ValueError for existing callers; the
            # original ImportError is chained so its details are not lost.
            raise ValueError(
                "Error, `python-docx` is required to run `DOCXParser`. Please install it using `pip install python-docx`."
            ) from err

        self.Document = Document

    async def ingest(self, data: str | bytes, *args, **kwargs) -> AsyncGenerator[str, None]:  # type: ignore
        """Ingest DOCX data and yield text from each paragraph.

        Args:
            data: Raw DOCX file contents as bytes. A `str` is rejected.
            *args: Accepted but not used by this parser.
            **kwargs: Accepted but not used by this parser.

        Yields:
            The `text` of each entry in the document's `paragraphs`, in the
            order the document exposes them.

        Raises:
            ValueError: If `data` is a `str` instead of bytes.
        """
        if isinstance(data, str):
            raise ValueError("DOCX data must be in bytes format.")

        # python-docx reads from a file-like object, so wrap the raw bytes.
        doc = self.Document(BytesIO(data))
        for paragraph in doc.paragraphs:
            yield paragraph.text
|