pptx_parser.py 1.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940
  1. # type: ignore
  2. from io import BytesIO
  3. from typing import AsyncGenerator
  4. from pptx import Presentation
  5. from core.base.parsers.base_parser import AsyncParser
  6. from core.base.providers import (
  7. CompletionProvider,
  8. DatabaseProvider,
  9. IngestionConfig,
  10. )
  11. class PPTXParser(AsyncParser[str | bytes]):
  12. """A parser for PPT data."""
  13. def __init__(
  14. self,
  15. config: IngestionConfig,
  16. database_provider: DatabaseProvider,
  17. llm_provider: CompletionProvider,
  18. ):
  19. self.database_provider = database_provider
  20. self.llm_provider = llm_provider
  21. self.config = config
  22. self.Presentation = Presentation
  23. async def ingest(
  24. self, data: str | bytes, **kwargs
  25. ) -> AsyncGenerator[str, None]: # type: ignore
  26. """Ingest PPT data and yield text from each slide."""
  27. if isinstance(data, str):
  28. raise ValueError("PPT data must be in bytes format.")
  29. prs = self.Presentation(BytesIO(data))
  30. for slide in prs.slides:
  31. for shape in slide.shapes:
  32. if hasattr(shape, "text"):
  33. yield shape.text