123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- # type: ignore
- from typing import AsyncGenerator
- from striprtf.striprtf import rtf_to_text
- from core.base.parsers.base_parser import AsyncParser
- from core.base.providers import (
- CompletionProvider,
- DatabaseProvider,
- IngestionConfig,
- )
class RTFParser(AsyncParser[str | bytes]):
    """Parser for Rich Text Format (.rtf) files.

    Converts RTF markup to plain text and emits the result one paragraph
    at a time.
    """

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the configuration and providers on the instance.

        Args:
            config: Ingestion configuration, kept as ``self.config``.
            database_provider: Kept as ``self.database_provider``.
            llm_provider: Kept as ``self.llm_provider``.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config
        # The RTF-to-text converter is held on the instance and invoked
        # from ingest().
        self.striprtf = rtf_to_text

    async def ingest(
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Yield the non-empty paragraphs of an RTF document as plain text.

        Args:
            data: RTF content. ``bytes`` input is decoded as UTF-8 and any
                undecodable bytes are silently dropped.
            **kwargs: Accepted but not used by this parser.

        Yields:
            Each paragraph (text separated by a blank line in the converted
            output) with surrounding whitespace stripped; whitespace-only
            paragraphs are skipped.

        Raises:
            ValueError: If converting the RTF content to plain text fails.
        """
        if isinstance(data, bytes):
            data = data.decode("utf-8", errors="ignore")

        # Only the conversion is guarded. Keeping the yields outside the
        # try block means an exception thrown into the suspended generator
        # by its consumer is not mislabeled as an RTF processing error.
        try:
            paragraphs = self.striprtf(data).split("\n\n")
        except Exception as e:
            raise ValueError(f"Error processing RTF file: {e}") from e

        for paragraph in paragraphs:
            stripped = paragraph.strip()
            if stripped:
                yield stripped
|