rst_parser.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. # type: ignore
  2. from typing import AsyncGenerator
  3. from core.base.parsers.base_parser import AsyncParser
  4. from core.base.providers import (
  5. CompletionProvider,
  6. DatabaseProvider,
  7. IngestionConfig,
  8. )
  9. class RSTParser(AsyncParser[str | bytes]):
  10. """Parser for reStructuredText (.rst) files."""
  11. def __init__(
  12. self,
  13. config: IngestionConfig,
  14. database_provider: DatabaseProvider,
  15. llm_provider: CompletionProvider,
  16. ):
  17. self.database_provider = database_provider
  18. self.llm_provider = llm_provider
  19. self.config = config
  20. try:
  21. from docutils.core import publish_string
  22. from docutils.writers import html5_polyglot
  23. self.publish_string = publish_string
  24. self.html5_polyglot = html5_polyglot
  25. except ImportError:
  26. raise ImportError(
  27. "Error: 'docutils' is required to run RSTParser. "
  28. "Please install it using pip: pip install docutils"
  29. )
  30. async def ingest(
  31. self, data: str | bytes, **kwargs
  32. ) -> AsyncGenerator[str, None]:
  33. if isinstance(data, bytes):
  34. data = data.decode("utf-8")
  35. try:
  36. # Convert RST to HTML
  37. html = self.publish_string(
  38. source=data,
  39. writer=self.html5_polyglot.Writer(),
  40. settings_overrides={"report_level": 5},
  41. )
  42. # Basic HTML cleanup
  43. import re
  44. text = html.decode("utf-8")
  45. text = re.sub(r"<[^>]+>", " ", text)
  46. text = re.sub(r"\s+", " ", text)
  47. # Split into paragraphs and yield non-empty ones
  48. paragraphs = text.split("\n\n")
  49. for paragraph in paragraphs:
  50. if paragraph.strip():
  51. yield paragraph.strip()
  52. except Exception as e:
  53. raise ValueError(f"Error processing RST file: {str(e)}")