1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- # type: ignore
- from typing import AsyncGenerator
- from core.base.parsers.base_parser import AsyncParser
- from core.base.providers import (
- CompletionProvider,
- DatabaseProvider,
- IngestionConfig,
- )
- class RSTParser(AsyncParser[str | bytes]):
- """Parser for reStructuredText (.rst) files."""
- def __init__(
- self,
- config: IngestionConfig,
- database_provider: DatabaseProvider,
- llm_provider: CompletionProvider,
- ):
- self.database_provider = database_provider
- self.llm_provider = llm_provider
- self.config = config
- try:
- from docutils.core import publish_string
- from docutils.writers import html5_polyglot
- self.publish_string = publish_string
- self.html5_polyglot = html5_polyglot
- except ImportError:
- raise ImportError(
- "Error: 'docutils' is required to run RSTParser. "
- "Please install it using pip: pip install docutils"
- )
- async def ingest(
- self, data: str | bytes, **kwargs
- ) -> AsyncGenerator[str, None]:
- if isinstance(data, bytes):
- data = data.decode("utf-8")
- try:
- # Convert RST to HTML
- html = self.publish_string(
- source=data,
- writer=self.html5_polyglot.Writer(),
- settings_overrides={"report_level": 5},
- )
- # Basic HTML cleanup
- import re
- text = html.decode("utf-8")
- text = re.sub(r"<[^>]+>", " ", text)
- text = re.sub(r"\s+", " ", text)
- # Split into paragraphs and yield non-empty ones
- paragraphs = text.split("\n\n")
- for paragraph in paragraphs:
- if paragraph.strip():
- yield paragraph.strip()
- except Exception as e:
- raise ValueError(f"Error processing RST file: {str(e)}")
|