|
|
@@ -95,9 +95,11 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
|
DocumentType.JSON: [parsers.JSONParser], # type: ignore
|
|
|
DocumentType.HTML: [parsers.HTMLParser], # type: ignore
|
|
|
DocumentType.XLS: [parsers.XLSParser], # type: ignore
|
|
|
- DocumentType.XLSX: [parsers.XLSXParser], # type: ignore
|
|
|
+ #DocumentType.XLSX: [parsers.XLSXParser], # type: ignore
|
|
|
#DocumentType.DOC: [parsers.DOCParser], # type: ignore
|
|
|
DocumentType.PPT: [parsers.PPTParser], # type: ignore
|
|
|
+ DocumentType.CSV: [parsers.CSVParserAdvanced], # type: ignore
|
|
|
+ DocumentType.XLSX: [parsers.XLSXParserAdvanced], # type: ignore
|
|
|
}
|
|
|
|
|
|
EXTRA_PARSERS = {
|
|
|
@@ -308,10 +310,9 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
|
# TODO - Remove code duplication between Unstructured & R2R
|
|
|
logger.info(f"Parser overrides: {parser_overrides}")
|
|
|
logger.info(f"R2R fallback parsers is: {document.document_type.value}")
|
|
|
- logger.info(f"R2R fallback parsers is: {self.R2R_FALLBACK_PARSERS.keys()}")
|
|
|
- logger.info(f"R2R fallback parsers is: {document.document_type.value in self.R2R_FALLBACK_PARSERS.keys()}")
|
|
|
- if document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys() or document.document_type.value in self.R2R_FALLBACK_PARSERS.keys():
|
|
|
- '''
|
|
|
+ logger.info(f"R2R fallback parsers is: {self.EXTRA_PARSERS.keys()}")
|
|
|
+ logger.info(f"R2R fallback parsers is: {document.document_type.value in self.EXTRA_PARSERS.keys()}")
|
|
|
+ if document.document_type.value in parser_overrides:
|
|
|
logger.info(
|
|
|
f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
|
|
|
)
|
|
|
@@ -336,17 +337,6 @@ class UnstructuredIngestionProvider(IngestionProvider):
|
|
|
)
|
|
|
elements.append(element)
|
|
|
|
|
|
- async for element in self.parse_fallback(
|
|
|
- file_content,
|
|
|
- ingestion_config=ingestion_config,
|
|
|
- parser_name=f"zerox_{DocumentType.PDF.value}",
|
|
|
- ):
|
|
|
- logger.warning(
|
|
|
- f"Using parser_override for {document.document_type}"
|
|
|
- )
|
|
|
- elements.append(element)
|
|
|
- '''
|
|
|
-
|
|
|
elif document.document_type in self.R2R_FALLBACK_PARSERS.keys():
|
|
|
logger.info(
|
|
|
f"Parsing {document.document_type}: {document.id} with fallback parser"
|