ppt_parser.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142
  1. from io import BytesIO
  2. from typing import AsyncGenerator
  3. from core.base.parsers.base_parser import AsyncParser
  4. from core.base.providers import (
  5. CompletionProvider,
  6. DatabaseProvider,
  7. IngestionConfig,
  8. )
  9. class PPTParser(AsyncParser[str | bytes]):
  10. """A parser for PPT data."""
  11. def __init__(
  12. self,
  13. config: IngestionConfig,
  14. database_provider: DatabaseProvider,
  15. llm_provider: CompletionProvider,
  16. ):
  17. self.database_provider = database_provider
  18. self.llm_provider = llm_provider
  19. self.config = config
  20. try:
  21. from pptx import Presentation
  22. self.Presentation = Presentation
  23. except ImportError:
  24. raise ValueError(
  25. "Error, `python-pptx` is required to run `PPTParser`. Please install it using `pip install python-pptx`."
  26. )
  27. async def ingest(self, data: str | bytes, **kwargs) -> AsyncGenerator[str, None]: # type: ignore
  28. """Ingest PPT data and yield text from each slide."""
  29. if isinstance(data, str):
  30. raise ValueError("PPT data must be in bytes format.")
  31. prs = self.Presentation(BytesIO(data))
  32. for slide in prs.slides:
  33. for shape in slide.shapes:
  34. if hasattr(shape, "text"):
  35. yield shape.text