jack 1 ay önce
ebeveyn
işleme
6181b81a05

+ 3 - 3
py/core/providers/ingestion/unstructured/base.py

@@ -302,10 +302,10 @@ class UnstructuredIngestionProvider(IngestionProvider):
 
 
         t0 = time.time()
         t0 = time.time()
         parser_overrides = ingestion_config_override.get(
         parser_overrides = ingestion_config_override.get(
-            "parser_overrides", {"pdf": "ocr"}
+            "parser_overrides", {}
         )
         )
         elements = []
         elements = []
-        parser_overrides = {"pdf": "unstructured"}
+        #parser_overrides = {"pdf": "unstructured"}
 
 
         # TODO - Cleanup this approach to be less hardcoded
         # TODO - Cleanup this approach to be less hardcoded
         # TODO - Remove code duplication between Unstructured & R2R
         # TODO - Remove code duplication between Unstructured & R2R
@@ -315,7 +315,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
         logger.info(f"R2R fallback parsers is: {document.document_type.value in self.EXTRA_PARSERS.keys()}")
         logger.info(f"R2R fallback parsers is: {document.document_type.value in self.EXTRA_PARSERS.keys()}")
         logger.info(f"R2R fallback parsers is: {document.document_type.value in parser_overrides}")
         logger.info(f"R2R fallback parsers is: {document.document_type.value in parser_overrides}")
         #if document.document_type.value in parser_overrides:
         #if document.document_type.value in parser_overrides:
-        if document.document_type.value in self.EXTRA_PARSERS.keys():
+        if document.document_type.value in parser_overrides:
             logger.info(
             logger.info(
                 f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
                 f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
             )
             )