jack hace 1 día
padre
commit
2ddcc563a0
Se ha modificado 1 fichero con 2 adiciones y 2 borrados
  1. 2 2
      py/core/providers/ingestion/unstructured/base.py

+ 2 - 2
py/core/providers/ingestion/unstructured/base.py

@@ -104,7 +104,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
         DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced},  # type: ignore
         DocumentType.PDF: {
             "ocr": parsers.OCRPDFParser,  # type: ignore
-            "unstructured": parsers.OCRPDFParser,  # type: ignore
+            "unstructured": parsers.VLMPDFParser,  # type: ignore
             "zerox": parsers.VLMPDFParser,  # type: ignore
         },
         DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced},  # type: ignore
@@ -308,7 +308,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
         # TODO - Remove code duplication between Unstructured & R2R
         logger.info(f"Parser overrides: {parser_overrides}")
         logger.info(f"R2R fallback parsers: {document.document_type.value in self.EXTRA_PARSERS.keys()}")
-        if document.document_type.value in parser_overrides: ##  or document.document_type.value in self.EXTRA_PARSERS.keys()
+        if document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys(): ##  or document.document_type.value in self.EXTRA_PARSERS.keys()
             logger.info(
                 f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
             )