rtf_parser.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152
  1. # type: ignore
  2. from typing import AsyncGenerator
  3. from core.base.parsers.base_parser import AsyncParser
  4. from core.base.providers import (
  5. CompletionProvider,
  6. DatabaseProvider,
  7. IngestionConfig,
  8. )
  9. class RTFParser(AsyncParser[str | bytes]):
  10. """Parser for Rich Text Format (.rtf) files."""
  11. def __init__(
  12. self,
  13. config: IngestionConfig,
  14. database_provider: DatabaseProvider,
  15. llm_provider: CompletionProvider,
  16. ):
  17. self.database_provider = database_provider
  18. self.llm_provider = llm_provider
  19. self.config = config
  20. try:
  21. from striprtf.striprtf import rtf_to_text
  22. self.striprtf = rtf_to_text
  23. except ImportError:
  24. raise ImportError(
  25. "Error: 'striprtf' is required to run RTFParser. "
  26. "Please install it using pip: pip install striprtf"
  27. )
  28. async def ingest(
  29. self, data: str | bytes, **kwargs
  30. ) -> AsyncGenerator[str, None]:
  31. if isinstance(data, bytes):
  32. data = data.decode("utf-8", errors="ignore")
  33. try:
  34. # Convert RTF to plain text
  35. plain_text = self.striprtf(data)
  36. # Split into paragraphs and yield non-empty ones
  37. paragraphs = plain_text.split("\n\n")
  38. for paragraph in paragraphs:
  39. if paragraph.strip():
  40. yield paragraph.strip()
  41. except Exception as e:
  42. raise ValueError(f"Error processing RTF file: {str(e)}")