"""Parse raw file bytes into plain text using MIME-type based dispatch."""

from langchain.document_loaders import Blob
from langchain.document_loaders.parsers import BS4HTMLParser, PyMuPDFParser
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain.document_loaders.parsers.txt import TextParser

# Maps a detected MIME type to the parser instance that handles it.
PARSER_HANDLERS = {
    "application/pdf": PyMuPDFParser(),
    "text/plain": TextParser(),
    "text/html": BS4HTMLParser(),
}

# Shared dispatcher built once at import time. No fallback parser is
# configured, so only the MIME types listed in PARSER_HANDLERS have a handler.
# NOTE(review): presumably MimeTypeBasedParser rejects any other MIME type
# with an error -- confirm against the installed langchain version.
MIMETYPE_PARSER = MimeTypeBasedParser(
    handlers=PARSER_HANDLERS,
    fallback_parser=None,
)


def _get_mimetype(file_bytes: bytes) -> str:
    """Return the MIME type that libmagic detects for *file_bytes*.

    Args:
        file_bytes: Raw content whose type should be sniffed.

    Returns:
        The MIME type string reported by ``magic``.

    Raises:
        ImportError: If the ``magic`` module cannot be imported.
    """
    # Imported lazily so that importing this module does not require the
    # optional dependency until a MIME type actually has to be detected.
    try:
        import magic
    except ImportError as exc:
        # Chain the original failure so the underlying cause (e.g. a missing
        # shared library) stays visible in the traceback.
        raise ImportError(
            "magic package not found, please install it with "
            "`pip install python-magic` and `brew install libmagic`"
        ) from exc

    detector = magic.Magic(mime=True)
    return detector.from_buffer(file_bytes)


def load(data: bytes) -> str:
    """Parse *data* with the parser registered for its MIME type.

    The MIME type is sniffed from the bytes themselves, the bytes are wrapped
    in a ``Blob`` and handed to ``MIMETYPE_PARSER``.

    Args:
        data: Raw file content.

    Returns:
        The ``page_content`` of every parsed document, joined by a blank line.

    Raises:
        ImportError: If the ``magic`` module cannot be imported.
    """
    blob = Blob.from_data(
        data=data,
        mime_type=_get_mimetype(data),
    )
    # Join straight from the lazy parser; no intermediate list is needed.
    return "\n\n".join(
        document.page_content for document in MIMETYPE_PARSER.lazy_parse(blob)
    )