ppt_parser.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. # type: ignore
  2. import struct
  3. from io import BytesIO
  4. from typing import AsyncGenerator
  5. import olefile
  6. from core.base.parsers.base_parser import AsyncParser
  7. from core.base.providers import (
  8. CompletionProvider,
  9. DatabaseProvider,
  10. IngestionConfig,
  11. )
  12. class PPTParser(AsyncParser[str | bytes]):
  13. """A parser for legacy PPT (PowerPoint 97-2003) files."""
  14. def __init__(
  15. self,
  16. config: IngestionConfig,
  17. database_provider: DatabaseProvider,
  18. llm_provider: CompletionProvider,
  19. ):
  20. self.database_provider = database_provider
  21. self.llm_provider = llm_provider
  22. self.config = config
  23. self.olefile = olefile
  24. def _extract_text_from_record(self, data: bytes) -> str:
  25. """Extract text from a PPT text record."""
  26. try:
  27. # Skip record header
  28. text_data = data[8:]
  29. # Convert from UTF-16-LE
  30. return text_data.decode("utf-16-le", errors="ignore").strip()
  31. except Exception:
  32. return ""
  33. async def ingest(
  34. self, data: str | bytes, **kwargs
  35. ) -> AsyncGenerator[str, None]:
  36. """Ingest PPT data and yield text from each slide."""
  37. if isinstance(data, str):
  38. raise ValueError("PPT data must be in bytes format.")
  39. try:
  40. ole = self.olefile.OleFileIO(BytesIO(data))
  41. # PPT stores text in PowerPoint Document stream
  42. if not ole.exists("PowerPoint Document"):
  43. raise ValueError("Not a valid PowerPoint file")
  44. # Read PowerPoint Document stream
  45. ppt_stream = ole.openstream("PowerPoint Document")
  46. content = ppt_stream.read()
  47. # Text records start with 0x0FA0 or 0x0FD0
  48. text_markers = [b"\xa0\x0f", b"\xd0\x0f"]
  49. current_position = 0
  50. while current_position < len(content):
  51. # Look for text markers
  52. for marker in text_markers:
  53. marker_pos = content.find(marker, current_position)
  54. if marker_pos != -1:
  55. # Get record size from header (4 bytes after marker)
  56. size_bytes = content[marker_pos + 2 : marker_pos + 6]
  57. record_size = struct.unpack("<I", size_bytes)[0]
  58. # Extract record data
  59. record_data = content[
  60. marker_pos : marker_pos + record_size + 8
  61. ]
  62. text = self._extract_text_from_record(record_data)
  63. if text.strip():
  64. yield text.strip()
  65. current_position = marker_pos + record_size + 8
  66. break
  67. else:
  68. current_position += 1
  69. except Exception as e:
  70. raise ValueError(f"Error processing PPT file: {str(e)}") from e
  71. finally:
  72. ole.close()