|
@@ -103,11 +103,11 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
|
|
|
|
|
|
EXTRA_PARSERS = {
|
|
EXTRA_PARSERS = {
|
|
|
#DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced}, # type: ignore
|
|
#DocumentType.CSV: {"advanced": parsers.CSVParserAdvanced}, # type: ignore
|
|
|
- #DocumentType.PDF: {
|
|
|
|
|
- # "ocr": parsers.OCRPDFParser, # type: ignore
|
|
|
|
|
- # "unstructured": parsers.PDFParserUnstructured, # type: ignore
|
|
|
|
|
- # "zerox": parsers.VLMPDFParser, # type: ignore
|
|
|
|
|
- #},
|
|
|
|
|
|
|
+ DocumentType.PDF: {
|
|
|
|
|
+ "ocr": parsers.OCRPDFParser, # type: ignore
|
|
|
|
|
+ "unstructured": parsers.PDFParserUnstructured, # type: ignore
|
|
|
|
|
+ "zerox": parsers.VLMPDFParser, # type: ignore
|
|
|
|
|
+ }
|
|
|
#DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced}, # type: ignore
|
|
#DocumentType.XLSX: {"advanced": parsers.XLSXParserAdvanced}, # type: ignore
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -301,7 +301,7 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
|
|
|
|
|
|
t0 = time.time()
|
|
t0 = time.time()
|
|
|
parser_overrides = ingestion_config_override.get(
|
|
parser_overrides = ingestion_config_override.get(
|
|
|
- "parser_overrides", {}
|
|
|
|
|
|
|
+ "parser_overrides", {"pdf": "ocr"}
|
|
|
)
|
|
)
|
|
|
elements = []
|
|
elements = []
|
|
|
|
|
|