docx_parser.py 1.0 KB

1234567891011121314151617181920212223242526272829303132333435363738
  1. # type: ignore
  2. from io import BytesIO
  3. from typing import AsyncGenerator
  4. from docx import Document
  5. from core.base.parsers.base_parser import AsyncParser
  6. from core.base.providers import (
  7. CompletionProvider,
  8. DatabaseProvider,
  9. IngestionConfig,
  10. )
  11. class DOCXParser(AsyncParser[str | bytes]):
  12. """A parser for DOCX data."""
  13. def __init__(
  14. self,
  15. config: IngestionConfig,
  16. database_provider: DatabaseProvider,
  17. llm_provider: CompletionProvider,
  18. ):
  19. self.database_provider = database_provider
  20. self.llm_provider = llm_provider
  21. self.config = config
  22. self.Document = Document
  23. async def ingest(
  24. self, data: str | bytes, *args, **kwargs
  25. ) -> AsyncGenerator[str, None]: # type: ignore
  26. """Ingest DOCX data and yield text from each paragraph."""
  27. if isinstance(data, str):
  28. raise ValueError("DOCX data must be in bytes format.")
  29. doc = self.Document(BytesIO(data))
  30. for paragraph in doc.paragraphs:
  31. yield paragraph.text