1234567891011121314151617181920212223242526272829303132333435363738394041424344 |
- from langchain.document_loaders import Blob
- from langchain.document_loaders.parsers import BS4HTMLParser, PyMuPDFParser
- from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
- from langchain.document_loaders.parsers.txt import TextParser
# Parser instance to use for each supported MIME type. These are the only
# content types the loader knows how to turn into text.
PARSER_HANDLERS = {
    "application/pdf": PyMuPDFParser(),
    "text/plain": TextParser(),
    "text/html": BS4HTMLParser(),
}

# Shared dispatcher that picks a parser from PARSER_HANDLERS based on the
# blob's MIME type.
# NOTE(review): with fallback_parser=None, MIME types missing from
# PARSER_HANDLERS are presumably rejected by MimeTypeBasedParser rather than
# parsed by a default -- confirm against the langchain documentation.
MIMETYPE_PARSER = MimeTypeBasedParser(
    handlers=PARSER_HANDLERS,
    fallback_parser=None,
)
def _get_mimetype(file_bytes: bytes) -> str:
    """Return the MIME type that the ``magic`` package reports for a buffer.

    Args:
        file_bytes: Raw content whose type should be detected.

    Returns:
        The MIME type string produced by ``magic.Magic(mime=True)`` for
        ``file_bytes``.

    Raises:
        ImportError: If the ``magic`` package cannot be imported. The
            original import failure is attached as the exception's cause.
    """
    # Imported inside the function so that a missing ``magic`` package only
    # fails when detection is actually requested, with an actionable message.
    try:
        import magic
    except ImportError as err:
        # Chain the original error so the underlying import failure is not
        # lost from the traceback.
        raise ImportError(
            "magic package not found, please install it with `pip install python-magic` and `brew install libmagic`"
        ) from err

    return magic.Magic(mime=True).from_buffer(file_bytes)
def load(data: bytes) -> str:
    """Extract the text content of a document supplied as raw bytes.

    The MIME type is detected from the bytes with ``_get_mimetype``, the
    data is wrapped in a ``Blob`` tagged with that type, and the blob is
    parsed through ``MIMETYPE_PARSER``.

    Args:
        data: Raw file content.

    Returns:
        The ``page_content`` of every document yielded by the parser, in
        order, separated by a blank line.
    """
    blob = Blob.from_data(
        data=data,
        mime_type=_get_mimetype(data),
    )
    # Consume the lazy parser directly; no intermediate list is needed just
    # to join the page contents.
    return "\n\n".join(
        doc.page_content for doc in MIMETYPE_PARSER.lazy_parse(blob)
    )
|