pptx_parser.py 1.3 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243
  1. # type: ignore
  2. from io import BytesIO
  3. from typing import AsyncGenerator
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class PPTXParser(AsyncParser[str | bytes]):
  11. """A parser for PPT data."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. try:
  22. from pptx import Presentation
  23. self.Presentation = Presentation
  24. except ImportError:
  25. raise ValueError(
  26. "Error, `python-pptx` is required to run `PPTXParser`. Please install it using `pip install python-pptx`."
  27. )
  28. async def ingest(self, data: str | bytes, **kwargs) -> AsyncGenerator[str, None]: # type: ignore
  29. """Ingest PPT data and yield text from each slide."""
  30. if isinstance(data, str):
  31. raise ValueError("PPT data must be in bytes format.")
  32. prs = self.Presentation(BytesIO(data))
  33. for slide in prs.slides:
  34. for shape in slide.shapes:
  35. if hasattr(shape, "text"):
  36. yield shape.text