jack vor 2 Monaten
Ursprung
Commit
0e0e6cf368
1 geänderte Datei mit 6 neuen und 16 gelöschten Zeilen
  1. 6 16
      py/core/providers/ingestion/unstructured/base.py

+ 6 - 16
py/core/providers/ingestion/unstructured/base.py

@@ -95,9 +95,11 @@ class UnstructuredIngestionProvider(IngestionProvider):
         DocumentType.JSON: [parsers.JSONParser],  # type: ignore
         DocumentType.HTML: [parsers.HTMLParser],  # type: ignore
         DocumentType.XLS: [parsers.XLSParser],  # type: ignore
-        DocumentType.XLSX: [parsers.XLSXParser],  # type: ignore
+        #DocumentType.XLSX: [parsers.XLSXParser],  # type: ignore
         #DocumentType.DOC: [parsers.DOCParser],  # type: ignore
         DocumentType.PPT: [parsers.PPTParser],  # type: ignore
+        DocumentType.CSV: [parsers.CSVParserAdvanced],  # type: ignore
+        DocumentType.XLSX: [parsers.XLSXParserAdvanced],  # type: ignore
     }
 
     EXTRA_PARSERS = {
@@ -308,10 +310,9 @@ class UnstructuredIngestionProvider(IngestionProvider):
         # TODO - Remove code duplication between Unstructured & R2R
         logger.info(f"Parser overrides: {parser_overrides}")
         logger.info(f"R2R fallback parsers is: {document.document_type.value}")
-        logger.info(f"R2R fallback parsers is: {self.R2R_FALLBACK_PARSERS.keys()}")
-        logger.info(f"R2R fallback parsers is: {document.document_type.value in self.R2R_FALLBACK_PARSERS.keys()}")
-        if document.document_type.value in parser_overrides or document.document_type.value in self.EXTRA_PARSERS.keys() or document.document_type.value in self.R2R_FALLBACK_PARSERS.keys():
-            '''
+        logger.info(f"R2R fallback parsers is: {self.EXTRA_PARSERS.keys()}")
+        logger.info(f"R2R fallback parsers is: {document.document_type.value in self.EXTRA_PARSERS.keys()}")
+        if document.document_type.value in parser_overrides:
             logger.info(
                 f"Using parser_override for {document.document_type} with input value {parser_overrides[document.document_type.value]}"
             )
@@ -336,17 +337,6 @@ class UnstructuredIngestionProvider(IngestionProvider):
                     )
                     elements.append(element)
 
-            async for element in self.parse_fallback(
-                    file_content,
-                    ingestion_config=ingestion_config,
-                    parser_name=f"zerox_{DocumentType.PDF.value}",
-                ):
-                    logger.warning(
-                        f"Using parser_override for {document.document_type}"
-                    )
-                    elements.append(element)
-            '''
-
         elif document.document_type in self.R2R_FALLBACK_PARSERS.keys():
             logger.info(
                 f"Parsing {document.document_type}: {document.id} with fallback parser"