docx_parser.py 1.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. from io import BytesIO
  2. from typing import AsyncGenerator
  3. from core.base.parsers.base_parser import AsyncParser
  4. from core.base.providers import (
  5. CompletionProvider,
  6. DatabaseProvider,
  7. IngestionConfig,
  8. )
  9. class DOCXParser(AsyncParser[str | bytes]):
  10. """A parser for DOCX data."""
  11. def __init__(
  12. self,
  13. config: IngestionConfig,
  14. database_provider: DatabaseProvider,
  15. llm_provider: CompletionProvider,
  16. ):
  17. self.database_provider = database_provider
  18. self.llm_provider = llm_provider
  19. self.config = config
  20. try:
  21. from docx import Document
  22. self.Document = Document
  23. except ImportError:
  24. raise ValueError(
  25. "Error, `python-docx` is required to run `DOCXParser`. Please install it using `pip install python-docx`."
  26. )
  27. async def ingest(self, data: str | bytes, *args, **kwargs) -> AsyncGenerator[str, None]: # type: ignore
  28. """Ingest DOCX data and yield text from each paragraph."""
  29. if isinstance(data, str):
  30. raise ValueError("DOCX data must be in bytes format.")
  31. doc = self.Document(BytesIO(data))
  32. for paragraph in doc.paragraphs:
  33. yield paragraph.text