md_parser.py 1019 B

123456789101112131415161718192021222324252627282930313233343536373839
  1. # type: ignore
  2. from typing import AsyncGenerator
  3. from bs4 import BeautifulSoup
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class MDParser(AsyncParser[str | bytes]):
  11. """A parser for Markdown data."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. import markdown
  22. self.markdown = markdown
  23. async def ingest(
  24. self, data: str | bytes, *args, **kwargs
  25. ) -> AsyncGenerator[str, None]:
  26. """Ingest Markdown data and yield text."""
  27. if isinstance(data, bytes):
  28. data = data.decode("utf-8")
  29. html = self.markdown.markdown(data)
  30. soup = BeautifulSoup(html, "html.parser")
  31. yield soup.get_text()