# jack/r2r
# type: ignore
from typing import AsyncGenerator

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)


class RSTParser(AsyncParser[str | bytes]):
    """Parser for reStructuredText (.rst) files.

    The source is rendered to HTML with docutils, the markup is removed, and
    the remaining plain text is yielded one block-level element (paragraph,
    heading, list item, ...) at a time.
    """

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the configuration/providers and bind the docutils entry points.

        Args:
            config: Ingestion configuration, stored on the instance.
            database_provider: Database provider, stored on the instance.
            llm_provider: Completion provider, stored on the instance.

        Raises:
            ImportError: If ``docutils`` is not installed.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

        # docutils is imported here rather than at module level so that this
        # module can be imported even when docutils is not installed.
        try:
            from docutils.core import publish_string
            from docutils.writers import html5_polyglot
        except ImportError as e:
            raise ImportError(
                "Error: 'docutils' is required to run RSTParser. "
                "Please install it using pip: pip install docutils"
            ) from e

        self.publish_string = publish_string
        self.html5_polyglot = html5_polyglot

    async def ingest(
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Convert an RST document to plain text and yield it by paragraph.

        Args:
            data: The RST source. ``bytes`` input is decoded as UTF-8.
            **kwargs: Accepted for interface compatibility; unused here.

        Yields:
            Non-empty, whitespace-normalized text blocks in document order.

        Raises:
            ValueError: If the document cannot be rendered or cleaned up. The
                original exception is attached as ``__cause__``.
        """
        if isinstance(data, bytes):
            data = data.decode("utf-8")

        # Only the conversion is guarded: keeping the ``yield`` statements
        # outside the ``try`` means exceptions thrown into the generator by a
        # consumer are not mislabelled as RST processing errors.
        try:
            rendered = self.publish_string(
                source=data,
                writer=self.html5_polyglot.Writer(),
                settings_overrides={
                    # 5 is above every docutils message level, so no system
                    # messages are written into the output.
                    "report_level": 5,
                    # Ingested documents may be untrusted: do not let the
                    # ``include``/``raw``/``csv-table`` directives read local
                    # files or inject raw markup.
                    "file_insertion_enabled": False,
                    "raw_enabled": False,
                },
            )
            # publish_string returns bytes unless the output encoding has
            # been overridden to "unicode"; accept either.
            if isinstance(rendered, bytes):
                rendered = rendered.decode("utf-8")
            paragraphs = self._html_to_paragraphs(rendered)
        except Exception as e:
            raise ValueError(f"Error processing RST file: {str(e)}") from e

        for paragraph in paragraphs:
            yield paragraph

    @staticmethod
    def _html_to_paragraphs(html_text: str) -> list[str]:
        """Reduce an HTML document to a list of plain-text paragraphs."""
        import re
        from html import unescape

        # The writer emits a complete HTML document. Everything inside
        # <head>, <style> and <script> (embedded CSS, metadata, a duplicate
        # of the title) is not document text, so drop it with its contents.
        text = re.sub(
            r"(?is)<(head|style|script)\b.*?</\1\s*>", " ", html_text
        )

        # Mark the end of each block-level element with a blank line *before*
        # the tags are stripped; otherwise the paragraph boundaries are lost.
        text = re.sub(
            r"(?i)</(?:p|h[1-6]|li|dt|dd|pre|blockquote|tr|caption|figcaption)\s*>",
            "\n\n",
            text,
        )

        # Strip the remaining tags, then decode entities. Decoding last keeps
        # escaped text such as "&lt;b&gt;" from being mistaken for a tag.
        text = re.sub(r"<[^>]+>", " ", text)
        text = unescape(text)

        # Split on blank lines first and only then collapse whitespace inside
        # each block, so the collapse cannot erase the paragraph separators.
        blocks = (
            re.sub(r"\s+", " ", block).strip()
            for block in re.split(r"\n{2,}", text)
        )
        return [block for block in blocks if block]