doc_loader.py 1.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344
  1. from langchain.document_loaders import Blob
  2. from langchain.document_loaders.parsers import BS4HTMLParser, PyMuPDFParser
  3. from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
  4. from langchain.document_loaders.parsers.txt import TextParser
  5. PARSER_HANDLERS = {
  6. "application/pdf": PyMuPDFParser(),
  7. "text/plain": TextParser(),
  8. "text/html": BS4HTMLParser(),
  9. }
  10. MIMETYPE_PARSER = MimeTypeBasedParser(
  11. handlers=PARSER_HANDLERS,
  12. fallback_parser=None,
  13. )
  14. def _get_mimetype(file_bytes: bytes) -> str:
  15. try:
  16. import magic
  17. except ImportError:
  18. raise ImportError(
  19. "magic package not found, please install it with `pip install python-magic` and `brew install libmagic`"
  20. )
  21. mime = magic.Magic(mime=True)
  22. mime_type = mime.from_buffer(file_bytes)
  23. return mime_type
  24. def load(data: bytes) -> str:
  25. mimetype = _get_mimetype(data)
  26. blob = Blob.from_data(
  27. data=data,
  28. mime_type=mimetype,
  29. )
  30. parser = MIMETYPE_PARSER
  31. docs = []
  32. for document in parser.lazy_parse(blob):
  33. docs.append(document)
  34. return "\n\n".join([doc.page_content for doc in docs])