|
@@ -104,7 +104,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced}, # type: ignore
|
|
DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced}, # type: ignore
|
|
DocumentType.PDF: {
|
|
DocumentType.PDF: {
|
|
"ocr": parsers.OCRPDFParser, # type: ignore
|
|
"ocr": parsers.OCRPDFParser, # type: ignore
|
|
- "unstructured": parsers.OCRPDFParser, # type: ignore
|
|
|
|
|
|
+ "unstructured": parsers.VLMPDFParser, # type: ignore
|
|
"zerox": parsers.VLMPDFParser, # type: ignore
|
|
"zerox": parsers.VLMPDFParser, # type: ignore
|
|
},
|
|
},
|
|
DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced}, # type: ignore
|
|
DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced}, # type: ignore
|
|
@@ -308,7 +308,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
# TODO - Remove code duplication between Unstructured & R2R
|
|
# TODO - Remove code duplication between Unstructured & R2R
|
|
logger.info(f"Parser overrides: {parser_overrides}")
|
|
logger.info(f"Parser overrides: {parser_overrides}")
|
|
logger.info(f"R2R fallback parsers: {document.document_type.value in self.EXTRA_PARSERS.keys()}")
|
|
logger.info(f"R2R fallback parsers: {document.document_type.value in self.EXTRA_PARSERS.keys()}")
|
|
- if document.document_type.value in parser_overrides: ## or document.document_type.value in self.EXTRA_PARSERS.keys()
|
|
|
|
|
|
+ if document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys(): ## or document.document_type.value in self.EXTRA_PARSERS.keys()
|
|
logger.info(
|
|
logger.info(
|
|
f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
|
|
f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
|
|
)
|
|
)
|