rtf_parser.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. # type: ignore
  2. from typing import AsyncGenerator
  3. from striprtf.striprtf import rtf_to_text
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class RTFParser(AsyncParser[str | bytes]):
  11. """Parser for Rich Text Format (.rtf) files."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. self.striprtf = rtf_to_text
  22. async def ingest(
  23. self, data: str | bytes, **kwargs
  24. ) -> AsyncGenerator[str, None]:
  25. if isinstance(data, bytes):
  26. data = data.decode("utf-8", errors="ignore")
  27. try:
  28. # Convert RTF to plain text
  29. plain_text = self.striprtf(data)
  30. # Split into paragraphs and yield non-empty ones
  31. paragraphs = plain_text.split("\n\n")
  32. for paragraph in paragraphs:
  33. if paragraph.strip():
  34. yield paragraph.strip()
  35. except Exception as e:
  36. raise ValueError(f"Error processing RTF file: {str(e)}") from e