123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328 |
- """
- Tests document ingestion functionality in R2R across all supported file types and modes.
- Supported file types include:
- - Documents: .doc, .docx, .odt, .pdf, .rtf, .txt
- - Presentations: .ppt, .pptx
- - Spreadsheets: .csv, .tsv, .xls, .xlsx
- - Markup: .html, .md, .org, .rst
- - Images: .bmp, .heic, .jpeg, .jpg, .png, .tiff
- - Email: .eml, .msg, .p7s
- - Other: .epub, .json
- Tests verify:
- - Basic ingestion for each file type
- - Hi-res ingestion for complex documents
- - Custom ingestion configurations
- - Raw text ingestion
- - Pre-processed chunk ingestion
- - Metadata handling
- """
- import time
- from pathlib import Path
- from typing import Any, Optional
- from uuid import UUID
- import pytest
- from r2r import R2RClient, R2RException
def file_ingestion(
    client: "R2RClient",
    file_path: str,
    ingestion_mode: Optional[str] = None,
    expected_status: str = "success",
    expected_chunk_count: Optional[int] = None,
    ingestion_config: Optional[dict] = None,
    metadata: Optional[dict] = None,
    cleanup: bool = True,
    wait_for_completion: bool = True,
    timeout: int = 600,
    poll_interval: float = 2.0,
) -> UUID:
    """
    Ingest a file and optionally wait until it reaches the expected status.

    Args:
        client: R2RClient instance
        file_path: Path to the file to ingest
        ingestion_mode: Optional ingestion mode ("fast", "hi-res", or None
            for default); only forwarded when truthy
        expected_status: Ingestion status that ends the polling loop
        expected_chunk_count: Accepted for interface compatibility; this
            helper does not currently check it
        ingestion_config: Optional ingestion configuration; only forwarded
            when truthy
        metadata: Optional document metadata; only forwarded when truthy
        cleanup: Whether to delete the document before returning (also
            attempted when a check fails after the document was created)
        wait_for_completion: Whether to poll until ``expected_status``
        timeout: Maximum time to poll for completion, in seconds
        poll_interval: Delay before the first poll and between polls,
            in seconds

    Returns:
        The ``document_id`` value from the creation response.

    Raises:
        AssertionError: If the file is missing, the creation response is
            malformed, or the document reports the "failed" status
        TimeoutError: If ``expected_status`` is not reached within
            ``timeout`` seconds
        R2RException: For any API error other than a 404 while polling
    """
    # Verify the file exists before talking to the server.
    assert Path(file_path).exists(), f"Test file not found: {file_path}"

    # Only pass optional arguments the caller actually supplied.
    ingest_args: dict[str, Any] = {"file_path": file_path}
    if ingestion_mode:
        ingest_args["ingestion_mode"] = ingestion_mode
    if ingestion_config:
        ingest_args["ingestion_config"] = ingestion_config
    if metadata:
        ingest_args["metadata"] = metadata

    doc_id = None
    try:
        ingestion_response = client.documents.create(**ingest_args)

        assert ingestion_response is not None
        assert "results" in ingestion_response
        assert "document_id" in ingestion_response["results"]

        doc_id = ingestion_response["results"]["document_id"]

        if wait_for_completion:
            # Initial delay before the first status poll.
            time.sleep(poll_interval)
            deadline = time.monotonic() + timeout

            while True:
                try:
                    retrieval_response = client.documents.retrieve(id=doc_id)
                except R2RException as e:
                    # A 404 means the document is not available yet; keep
                    # polling. Any other API error is re-raised.
                    if e.status_code != 404:
                        raise
                else:
                    ingestion_status = retrieval_response["results"][
                        "ingestion_status"
                    ]
                    if ingestion_status == expected_status:
                        break
                    if ingestion_status == "failed":
                        raise AssertionError(
                            f"Document ingestion failed: {retrieval_response}"
                        )

                # Enforce the timeout on every iteration, not only on 404s,
                # so a document stuck in a non-terminal status cannot make
                # the loop spin forever.
                if time.monotonic() > deadline:
                    raise TimeoutError(
                        f"Ingestion didn't complete within {timeout} seconds"
                    )
                time.sleep(poll_interval)
    finally:
        if cleanup and doc_id is not None:
            try:
                client.documents.delete(id=doc_id)
            except R2RException:
                # Cleanup is best-effort; a failed delete must not hide the
                # real outcome of the ingestion check.
                pass

    # Returning after (not inside) the ``finally`` block lets assertion and
    # timeout errors propagate to the calling test.
    return doc_id
@pytest.fixture(scope="session")
def config():
    """Provide session-scoped connection settings for the tests."""

    class TestConfig:
        # Base URL handed to the R2RClient constructor.
        base_url = "http://localhost:7272"
        # Credentials used for the superuser login.
        superuser_email = "admin@example.com"
        superuser_password = "change_me_immediately"

    settings = TestConfig()
    return settings
@pytest.fixture(scope="session")
def client(config):
    """Build an R2RClient for the configured URL and log in as superuser."""
    r2r_client = R2RClient(config.base_url)
    r2r_client.users.login(
        config.superuser_email,
        config.superuser_password,
    )
    return r2r_client
@pytest.mark.parametrize(
    "file_type,file_path",
    [
        ("bmp", "core/examples/supported_file_types/bmp.bmp"),
        ("csv", "core/examples/supported_file_types/csv.csv"),
        ("doc", "core/examples/supported_file_types/doc.doc"),
        ("docx", "core/examples/supported_file_types/docx.docx"),
        ("eml", "core/examples/supported_file_types/eml.eml"),
        ("epub", "core/examples/supported_file_types/epub.epub"),
        ("heic", "core/examples/supported_file_types/heic.heic"),
        ("html", "core/examples/supported_file_types/html.html"),
        ("json", "core/examples/supported_file_types/json.json"),
        ("jpeg", "core/examples/supported_file_types/jpeg.jpeg"),
        ("jpg", "core/examples/supported_file_types/jpg.jpg"),
        ("md", "core/examples/supported_file_types/md.md"),
        ("msg", "core/examples/supported_file_types/msg.msg"),
        ("odt", "core/examples/supported_file_types/odt.odt"),
        ("org", "core/examples/supported_file_types/org.org"),
        ("p7s", "core/examples/supported_file_types/p7s.p7s"),
        ("pdf", "core/examples/supported_file_types/pdf.pdf"),
        ("png", "core/examples/supported_file_types/png.png"),
        ("ppt", "core/examples/supported_file_types/ppt.ppt"),
        ("pptx", "core/examples/supported_file_types/pptx.pptx"),
        ("rst", "core/examples/supported_file_types/rst.rst"),
        ("rtf", "core/examples/supported_file_types/rtf.rtf"),
        ("tiff", "core/examples/supported_file_types/tiff.tiff"),
        ("txt", "core/examples/supported_file_types/txt.txt"),
        ("tsv", "core/examples/supported_file_types/tsv.tsv"),
        ("xls", "core/examples/supported_file_types/xls.xls"),
        ("xlsx", "core/examples/supported_file_types/xlsx.xlsx"),
    ],
)
def test_file_type_ingestion(
    client: R2RClient, file_type: str, file_path: str
):
    """Test default-mode ingestion of one sample file per file type.

    ``file_type`` is not used in the body; it is part of the parametrized
    values alongside ``file_path``.
    """
    # No try/except wrapper: any exception from the helper already fails
    # the test, so catching it only to re-raise added nothing.
    result = file_ingestion(
        client=client,
        file_path=file_path,
        cleanup=True,
        wait_for_completion=True,
    )
    assert result is not None
@pytest.mark.parametrize(
    "file_type,file_path",
    [
        ("pdf", "core/examples/supported_file_types/pdf.pdf"),
        ("docx", "core/examples/supported_file_types/docx.docx"),
        ("pptx", "core/examples/supported_file_types/pptx.pptx"),
    ],
)
def test_hires_ingestion(client: R2RClient, file_type: str, file_path: str):
    """Test hi-res ingestion of the pdf, docx and pptx sample files.

    The pdf case is skipped rather than failed when the raised error
    reports that Poppler is not installed.
    """
    try:
        ingested_id = file_ingestion(
            client=client,
            file_path=file_path,
            ingestion_mode="hi-res",
            cleanup=True,
            wait_for_completion=True,
        )
        assert ingested_id is not None
    except Exception as exc:
        # Only the pdf case may be downgraded to a skip; every other
        # failure (and any other pdf failure) propagates unchanged.
        if (
            file_type == "pdf"
            and "PDF processing requires Poppler to be installed" in str(exc)
        ):
            pytest.skip("Skipping PDF test due to missing Poppler dependency")
        raise
def test_custom_ingestion_config(client: R2RClient):
    """Test ingestion of the sample PDF in "custom" mode with explicit
    configuration parameters."""
    custom_config = {
        "provider": "r2r",
        "strategy": "auto",
        # FIXME: "chunking_strategy": "by_title" is left out because it was
        # not implemented in the ingestion config.
        "new_after_n_chars": 256,
        "max_characters": 512,
        "combine_under_n_chars": 64,
        "overlap": 100,
    }

    # No try/except wrapper: any exception from the helper already fails
    # the test, so catching it only to re-raise added nothing.
    result = file_ingestion(
        client=client,
        file_path="core/examples/supported_file_types/pdf.pdf",
        ingestion_mode="custom",
        ingestion_config=custom_config,
        cleanup=True,
        wait_for_completion=True,
    )
    assert result is not None
def test_raw_text_ingestion(client: R2RClient):
    """Test ingestion of raw text content.

    Creates a document from an in-memory string in "fast" mode, polls until
    its ingestion status is "success", and deletes the document afterwards
    whether or not the test passed.
    """
    timeout_seconds = 600
    poll_interval_seconds = 2

    text_content = "This is a test document.\nIt has multiple lines.\nTesting raw text ingestion."

    response = client.documents.create(
        raw_text=text_content, ingestion_mode="fast"
    )

    assert response is not None
    assert "results" in response
    assert "document_id" in response["results"]

    doc_id = response["results"]["document_id"]

    try:
        deadline = time.monotonic() + timeout_seconds
        while True:
            try:
                retrieval_response = client.documents.retrieve(id=doc_id)
            except R2RException:
                # API errors are treated as "not ready yet"; the deadline
                # check below bounds how long we keep retrying.
                retrieval_response = None

            if retrieval_response is not None:
                status = retrieval_response["results"]["ingestion_status"]
                if status == "success":
                    break
                # Fail fast instead of polling a document that will never
                # succeed.
                assert (
                    status != "failed"
                ), f"Document ingestion failed: {retrieval_response}"

            # Checked on every iteration so a document stuck in a
            # non-success status cannot loop forever.
            if time.monotonic() > deadline:
                raise TimeoutError("Ingestion didn't complete within timeout")
            time.sleep(poll_interval_seconds)
    finally:
        # Always remove the document so a failing run leaves nothing behind.
        client.documents.delete(id=doc_id)
def test_chunks_ingestion(client: R2RClient):
    """Test that creating a document from pre-processed chunks returns a
    document id, then delete that document."""
    prepared_chunks = [
        "This is chunk 1",
        "This is chunk 2",
        "This is chunk 3",
    ]

    creation = client.documents.create(
        chunks=prepared_chunks,
        ingestion_mode="fast",
    )

    # The response must carry a results payload containing the new id.
    assert creation is not None
    assert "results" in creation
    created = creation["results"]
    assert "document_id" in created

    # Remove the document created by this test.
    client.documents.delete(id=created["document_id"])
def test_metadata_handling(client: R2RClient):
    """Test that metadata supplied at ingestion is returned on retrieval.

    The sample PDF is ingested in "fast" mode with custom metadata, the
    stored metadata is compared against the input plus a "version" entry,
    and the document is deleted even when the comparison fails.
    """
    metadata = {
        "title": "Test Document",
        "author": "Test Author",
        "custom_field": "custom_value",
    }

    # cleanup=False keeps the document so its metadata can be inspected.
    doc_id = file_ingestion(
        client=client,
        file_path="core/examples/supported_file_types/pdf.pdf",
        ingestion_mode="fast",
        metadata=metadata,
        cleanup=False,
        wait_for_completion=True,
    )

    try:
        # The stored metadata is expected to also carry the
        # server-assigned version; build the expectation without mutating
        # the input dict.
        expected_metadata = {**metadata, "version": "v0"}

        doc = client.documents.retrieve(id=doc_id)
        assert doc["results"]["metadata"] == expected_metadata
    finally:
        # Clean up even if the assertion above fails.
        client.documents.delete(id=doc_id)
|