jack 1 dag geleden
bovenliggende
commit
ec1a58667b
1 gewijzigd bestand met 12 toevoegingen en 1 verwijdering
  1. 12 1
      py/core/providers/ingestion/unstructured/base.py

+ 12 - 1
py/core/providers/ingestion/unstructured/base.py

@@ -306,7 +306,8 @@ class UnstructuredIngestionProvider(IngestionProvider):
 
         # TODO - Cleanup this approach to be less hardcoded
         # TODO - Remove code duplication between Unstructured & R2R
-        if document.document_type.value in parser_overrides:
+        logger.info(f"Parser overrides: {parser_overrides}")
+        if document.document_type.value in parser_overrides or document.document_type.value in self.R2R_FALLBACK_PARSERS.keys():
             logger.info(
                 f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
             )
@@ -330,6 +331,16 @@ class UnstructuredIngestionProvider(IngestionProvider):
                         f"Using OCR parser_override for {document.document_type}"
                     )
                     elements.append(element)
+            else:
+                async for element in self.parse_fallback(
+                    file_content,
+                    ingestion_config=ingestion_config,
+                    parser_name=f"ocr_{DocumentType.PDF.value}",
+                ):
+                    logger.warning(
+                        f"Using OCR parser_override for {document.document_type}"
+                    )
+                    elements.append(element)
 
         elif document.document_type in self.R2R_FALLBACK_PARSERS.keys():
             logger.info(