jack vor 2 Monaten
Ursprung
Commit
5c3384cd82
1 geänderte Datei mit 11 neuen und 9 gelöschten Zeilen
  1. 11 9
      py/core/providers/ingestion/unstructured/base.py

+ 11 - 9
py/core/providers/ingestion/unstructured/base.py

@@ -101,13 +101,13 @@ class UnstructuredIngestionProvider(IngestionProvider):
     }
 
     EXTRA_PARSERS = {
-        #DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced},  # type: ignore
-        DocumentType.PDF: {
-            "ocr": parsers.OCRPDFParser,  # type: ignore
-            "unstructured": parsers.VLMPDFParser,  # type: ignore
-            "zerox": parsers.VLMPDFParser,  # type: ignore
-        },
-        #DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced},  # type: ignore
+        DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced},  # type: ignore
+        #DocumentType.PDF: {
+        #    "ocr": parsers.OCRPDFParser,  # type: ignore
+        #    "unstructured": parsers.VLMPDFParser,  # type: ignore
+        #    "zerox": parsers.VLMPDFParser,  # type: ignore
+        #},
+        DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced},  # type: ignore
     }
 
     IMAGE_TYPES = {
@@ -307,8 +307,10 @@ class UnstructuredIngestionProvider(IngestionProvider):
         # TODO - Cleanup this approach to be less hardcoded
         # TODO - Remove code duplication between Unstructured & R2R
         logger.info(f"Parser overrides: {parser_overrides}")
-        logger.info(f"R2R fallback parsers is: {document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys()}")
-        if document.document_type.value in parser_overrides:
+        logger.info(f"R2R fallback parsers is: {document.document_type.value}")
+        logger.info(f"R2R fallback parsers is: {self.R2R_FALLBACK_PARSERS.keys()}")
+        logger.info(f"R2R fallback parsers is: {document.document_type.value in self.R2R_FALLBACK_PARSERS.keys()}")
+        if document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys() or document.document_type.value in self.R2R_FALLBACK_PARSERS.keys():
             '''
             logger.info(
                 f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"