jack пре 1 недеља
родитељ
комит
3341427aa7
1 измењен фајл са 6 додатих и 6 уклоњених редова
  1. 6 6
      py/core/providers/ingestion/unstructured/base.py

+ 6 - 6
py/core/providers/ingestion/unstructured/base.py

@@ -103,11 +103,11 @@ class UnstructuredIngestionProvider(IngestionProvider):
 
     EXTRA_PARSERS = {
         #DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced},  # type: ignore
-        #DocumentType.PDF: {
-        #    "ocr": parsers.OCRPDFParser,  # type: ignore
-       #     "unstructured": parsers.PDFParserUnstructured,  # type: ignore
-       #     "zerox": parsers.VLMPDFParser,  # type: ignore
-        #},
+        DocumentType.PDF: {
+            "ocr": parsers.OCRPDFParser,  # type: ignore
+            "unstructured": parsers.PDFParserUnstructured,  # type: ignore
+            "zerox": parsers.VLMPDFParser,  # type: ignore
+        }
         #DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced},  # type: ignore
     }
 
@@ -301,7 +301,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
 
         t0 = time.time()
         parser_overrides = ingestion_config_override.get(
-            "parser_overrides", {}
+            "parser_overrides", {"pdf": "ocr"}
         )
         elements = []