1234567891011121314151617181920212223242526272829303132 |
- # type: ignore
- from typing import AsyncGenerator
- from bs4 import BeautifulSoup
- from core.base.parsers.base_parser import AsyncParser
- from core.base.providers import (
- CompletionProvider,
- DatabaseProvider,
- IngestionConfig,
- )
class HTMLParser(AsyncParser[str | bytes]):
    """Parser that turns an HTML document into its plain-text content."""

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Keep references to the ingestion config and both providers.

        Args:
            config: Ingestion settings, stored as ``self.config``.
            database_provider: Stored as ``self.database_provider``.
            llm_provider: Stored as ``self.llm_provider``.
        """
        # Nothing in this class reads these attributes; they are only stored.
        self.config = config
        self.llm_provider = llm_provider
        self.database_provider = database_provider

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Parse ``data`` as HTML and yield its text.

        The markup is parsed with BeautifulSoup's ``"html.parser"`` backend
        and the text of the whole document is yielded as one string, so the
        generator produces exactly one item.

        Args:
            data: Raw HTML, either as text or as bytes.
            *args: Accepted but not used by this method.
            **kwargs: Accepted but not used by this method.

        Yields:
            The result of ``get_text()`` on the parsed document.
        """
        document = BeautifulSoup(data, "html.parser")
        extracted_text = document.get_text()
        yield extracted_text
|