|
|
@@ -101,13 +101,13 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
|
}
|
|
|
|
|
|
EXTRA_PARSERS = {
|
|
|
- #DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced}, # type: ignore
|
|
|
- DocumentType.PDF: {
|
|
|
- "ocr": parsers.OCRPDFParser, # type: ignore
|
|
|
- "unstructured": parsers.VLMPDFParser, # type: ignore
|
|
|
- "zerox": parsers.VLMPDFParser, # type: ignore
|
|
|
- },
|
|
|
- #DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced}, # type: ignore
|
|
|
+ DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced}, # type: ignore
|
|
|
+ #DocumentType.PDF: {
|
|
|
+ # "ocr": parsers.OCRPDFParser, # type: ignore
|
|
|
+ # "unstructured": parsers.VLMPDFParser, # type: ignore
|
|
|
+ # "zerox": parsers.VLMPDFParser, # type: ignore
|
|
|
+ #},
|
|
|
+ DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced}, # type: ignore
|
|
|
}
|
|
|
|
|
|
IMAGE_TYPES = {
|
|
|
@@ -307,8 +307,10 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
|
# TODO - Cleanup this approach to be less hardcoded
|
|
|
# TODO - Remove code duplication between Unstructured & R2R
|
|
|
logger.info(f"Parser overrides: {parser_overrides}")
|
|
|
- logger.info(f"R2R fallback parsers is: {document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys()}")
|
|
|
- if document.document_type.value in parser_overrides:
|
|
|
+ logger.info(f"R2R fallback parsers is: {document.document_type.value}")
|
|
|
+ logger.info(f"R2R fallback parsers is: {self.R2R_FALLBACK_PARSERS.keys()}")
|
|
|
+ logger.info(f"R2R fallback parsers is: {document.document_type.value in self.R2R_FALLBACK_PARSERS.keys()}")
|
|
|
+ if document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys() or document.document_type.value in self.R2R_FALLBACK_PARSERS.keys():
|
|
|
'''
|
|
|
logger.info(
|
|
|
f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
|