rst_parser.py 1.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758
  1. # type: ignore
  2. from typing import AsyncGenerator
  3. from docutils.core import publish_string
  4. from docutils.writers import html5_polyglot
  5. from core.base.parsers.base_parser import AsyncParser
  6. from core.base.providers import (
  7. CompletionProvider,
  8. DatabaseProvider,
  9. IngestionConfig,
  10. )
  11. class RSTParser(AsyncParser[str | bytes]):
  12. """Parser for reStructuredText (.rst) files."""
  13. def __init__(
  14. self,
  15. config: IngestionConfig,
  16. database_provider: DatabaseProvider,
  17. llm_provider: CompletionProvider,
  18. ):
  19. self.database_provider = database_provider
  20. self.llm_provider = llm_provider
  21. self.config = config
  22. self.publish_string = publish_string
  23. self.html5_polyglot = html5_polyglot
  24. async def ingest(
  25. self, data: str | bytes, **kwargs
  26. ) -> AsyncGenerator[str, None]:
  27. if isinstance(data, bytes):
  28. data = data.decode("utf-8")
  29. try:
  30. # Convert RST to HTML
  31. html = self.publish_string(
  32. source=data,
  33. writer=self.html5_polyglot.Writer(),
  34. settings_overrides={"report_level": 5},
  35. )
  36. # Basic HTML cleanup
  37. import re
  38. text = html.decode("utf-8")
  39. text = re.sub(r"<[^>]+>", " ", text)
  40. text = re.sub(r"\s+", " ", text)
  41. # Split into paragraphs and yield non-empty ones
  42. paragraphs = text.split("\n\n")
  43. for paragraph in paragraphs:
  44. if paragraph.strip():
  45. yield paragraph.strip()
  46. except Exception as e:
  47. raise ValueError(f"Error processing RST file: {str(e)}") from e