test_ingestion.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480
"""Tests document ingestion functionality in R2R across all supported file
types and modes.

Supported file types include:
- Documents: .doc, .docx, .odt, .pdf, .rtf, .txt
- Presentations: .ppt, .pptx
- Spreadsheets: .csv, .tsv, .xls, .xlsx
- Markup: .html, .md, .org, .rst
- Images: .bmp, .heic, .jpeg, .jpg, .png, .tiff
- Email: .eml, .msg, .p7s
- Source code: .css, .js, .py, .ts
- Other: .epub, .json

Tests verify:
- Basic ingestion for each file type
- Hi-res ingestion for complex documents
- OCR ingestion for PDF files
- Custom ingestion configurations
- Raw text ingestion
- Pre-processed chunk ingestion
- Image ingestion with a `vlm` ingestion configuration
- Metadata and title handling
"""
  19. import time
  20. from pathlib import Path
  21. from typing import Any, Optional
  22. from uuid import UUID
  23. import pytest
  24. import contextlib
  25. from r2r import R2RClient, R2RException
  26. def file_ingestion(
  27. client: R2RClient,
  28. file_path: Optional[str] = None,
  29. ingestion_mode: Optional[str] = None,
  30. expected_status: str = "success",
  31. expected_chunk_count: Optional[int] = None,
  32. ingestion_config: Optional[dict] = None,
  33. metadata: Optional[dict] = None,
  34. cleanup: bool = True,
  35. wait_for_completion: bool = True,
  36. raw_text: Optional[str] = None,
  37. timeout: int = 600,
  38. ) -> UUID:
  39. """Test ingestion of a file with the given parameters.
  40. Args:
  41. client: R2RClient instance
  42. file_path: Path to the file to ingest
  43. ingestion_mode: Optional ingestion mode ("fast", "hi-res", or None for default)
  44. expected_status: Expected final status of the document
  45. expected_chunk_count: Optional number of chunks to expect
  46. cleanup: Whether to delete the document after testing
  47. wait_for_completion: Whether to wait for ingestion to complete
  48. timeout: Maximum time to wait for ingestion completion in seconds
  49. Returns:
  50. dict: Document details after ingestion
  51. Raises:
  52. AssertionError: If any checks fail
  53. TimeoutError: If ingestion doesn't complete within timeout period
  54. """
  55. doc_id = None
  56. try:
  57. # Verify file exists
  58. if file_path:
  59. assert Path(file_path).exists(), f"Test file not found: {file_path}"
  60. # Start ingestion
  61. ingest_args: dict[str, Any] = {"file_path": file_path}
  62. else:
  63. ingest_args = {"raw_text": raw_text}
  64. if ingestion_mode:
  65. ingest_args["ingestion_mode"] = ingestion_mode
  66. if ingestion_config:
  67. ingest_args["ingestion_config"] = ingestion_config
  68. if metadata:
  69. ingest_args["metadata"] = metadata
  70. ingestion_response = client.documents.create(**ingest_args)
  71. assert ingestion_response is not None
  72. assert ingestion_response.results is not None
  73. assert ingestion_response.results.document_id is not None
  74. doc_id = ingestion_response.results.document_id
  75. if wait_for_completion:
  76. time.sleep(2)
  77. start_time = time.time()
  78. while True:
  79. try:
  80. retrieval_response = client.documents.retrieve(id=doc_id)
  81. ingestion_status = retrieval_response.results.ingestion_status
  82. if ingestion_status == expected_status:
  83. break
  84. elif ingestion_status == "failed":
  85. raise AssertionError(
  86. f"Document ingestion failed: {retrieval_response}")
  87. except R2RException as e:
  88. if e.status_code == 404:
  89. # Document not yet available, continue polling if within timeout
  90. if time.time() - start_time > timeout:
  91. raise TimeoutError(
  92. f"Ingestion didn't complete within {timeout} seconds"
  93. )
  94. else:
  95. # Re-raise other errors
  96. raise
  97. time.sleep(2)
  98. return doc_id
  99. # except Exception as e:
  100. # raise e
  101. finally:
  102. assert doc_id is not None
  103. if cleanup and doc_id is not None:
  104. with contextlib.suppress(R2RException):
  105. client.documents.delete(id=doc_id)
  106. return doc_id
  107. @pytest.fixture(scope="session")
  108. def config():
  109. class TestConfig:
  110. base_url = "http://localhost:7272"
  111. superuser_email = "admin@example.com"
  112. superuser_password = "change_me_immediately"
  113. return TestConfig()
  114. @pytest.fixture(scope="session")
  115. def client(config):
  116. """Create a client instance and log in as a superuser."""
  117. client = R2RClient(config.base_url)
  118. client.users.login(config.superuser_email, config.superuser_password)
  119. return client
  120. @pytest.mark.parametrize(
  121. "file_type,file_path",
  122. [
  123. ("bmp", "core/examples/supported_file_types/bmp.bmp"),
  124. ("csv", "core/examples/supported_file_types/csv.csv"),
  125. ("css", "core/examples/supported_file_types/css.css"),
  126. ("doc", "core/examples/supported_file_types/doc.doc"),
  127. ("docx", "core/examples/supported_file_types/docx.docx"),
  128. ("eml", "core/examples/supported_file_types/eml.eml"),
  129. ("epub", "core/examples/supported_file_types/epub.epub"),
  130. ("heic", "core/examples/supported_file_types/heic.heic"),
  131. ("html", "core/examples/supported_file_types/html.html"),
  132. ("json", "core/examples/supported_file_types/json.json"),
  133. ("js", "core/examples/supported_file_types/js.js"),
  134. ("jpeg", "core/examples/supported_file_types/jpeg.jpeg"),
  135. ("jpg", "core/examples/supported_file_types/jpg.jpg"),
  136. ("md", "core/examples/supported_file_types/md.md"),
  137. ("msg", "core/examples/supported_file_types/msg.msg"),
  138. ("odt", "core/examples/supported_file_types/odt.odt"),
  139. ("org", "core/examples/supported_file_types/org.org"),
  140. ("p7s", "core/examples/supported_file_types/p7s.p7s"),
  141. ("pdf", "core/examples/supported_file_types/pdf.pdf"),
  142. ("png", "core/examples/supported_file_types/png.png"),
  143. ("ppt", "core/examples/supported_file_types/ppt.ppt"),
  144. ("pptx", "core/examples/supported_file_types/pptx.pptx"),
  145. ("py", "core/examples/supported_file_types/py.py"),
  146. ("rst", "core/examples/supported_file_types/rst.rst"),
  147. ("rtf", "core/examples/supported_file_types/rtf.rtf"),
  148. ("tiff", "core/examples/supported_file_types/tiff.tiff"),
  149. ("txt", "core/examples/supported_file_types/txt.txt"),
  150. ("ts", "core/examples/supported_file_types/ts.ts"),
  151. ("tsv", "core/examples/supported_file_types/tsv.tsv"),
  152. ("xls", "core/examples/supported_file_types/xls.xls"),
  153. ("xlsx", "core/examples/supported_file_types/xlsx.xlsx"),
  154. ],
  155. )
  156. def test_file_type_ingestion(client: R2RClient, file_type: str,
  157. file_path: str):
  158. """Test ingestion of specific file type."""
  159. try:
  160. result = file_ingestion(
  161. client=client,
  162. file_path=file_path,
  163. cleanup=True,
  164. wait_for_completion=True,
  165. )
  166. assert result is not None
  167. except Exception:
  168. raise
  169. @pytest.mark.parametrize(
  170. "file_type,file_path",
  171. [
  172. ("pdf", "core/examples/supported_file_types/pdf.pdf"),
  173. ],
  174. )
  175. def test_hires_ingestion(client: R2RClient, file_type: str, file_path: str):
  176. """Test hi-res ingestion with complex documents containing mixed
  177. content."""
  178. if file_type == "pdf":
  179. try:
  180. result = file_ingestion(
  181. client=client,
  182. file_path=file_path,
  183. ingestion_mode="hi-res",
  184. cleanup=True,
  185. wait_for_completion=True,
  186. )
  187. assert result is not None
  188. except Exception as e: # Changed from R2RException to Exception
  189. if "PDF processing requires Poppler to be installed" in str(e):
  190. pytest.skip(
  191. "Skipping PDF test due to missing Poppler dependency")
  192. raise
  193. else:
  194. result = file_ingestion(
  195. client=client,
  196. file_path=file_path,
  197. ingestion_mode="hi-res",
  198. cleanup=True,
  199. wait_for_completion=True,
  200. )
  201. assert result is not None
  202. @pytest.mark.parametrize(
  203. "file_type,file_path",
  204. [
  205. ("pdf", "core/examples/supported_file_types/pdf.pdf"),
  206. ],
  207. )
  208. def test_ocr_ingestion(client: R2RClient, file_type: str, file_path: str):
  209. """Test ocr ingestion for a pdf file."""
  210. result = file_ingestion(
  211. client=client,
  212. file_path=file_path,
  213. ingestion_mode="ocr",
  214. cleanup=True,
  215. wait_for_completion=True,
  216. )
  217. assert result is not None
  218. def test_custom_ingestion_config(client: R2RClient):
  219. """Test ingestion with custom configuration parameters."""
  220. custom_config = {
  221. "provider": "r2r",
  222. "strategy": "auto",
  223. # "chunking_strategy": "by_title", Fixme: This was not implemented in the ingestion config
  224. "new_after_n_chars": 256,
  225. "max_characters": 512,
  226. "combine_under_n_chars": 64,
  227. "overlap": 100,
  228. }
  229. try:
  230. result = file_ingestion(
  231. client=client,
  232. # file_path="core/examples/supported_file_types/pdf.pdf",
  233. raw_text="This is a test document.",
  234. ingestion_mode="custom",
  235. ingestion_config=custom_config,
  236. cleanup=True,
  237. wait_for_completion=True,
  238. )
  239. assert result is not None
  240. except Exception:
  241. raise
  242. def test_raw_text_ingestion(client: R2RClient):
  243. """Test ingestion of raw text content."""
  244. text_content = "This is a test document.\nIt has multiple lines.\nTesting raw text ingestion."
  245. response = client.documents.create(raw_text=text_content,
  246. ingestion_mode="fast")
  247. assert response is not None
  248. assert response.results is not None
  249. assert response.results.document_id is not None
  250. doc_id = response.results.document_id
  251. start_time = time.time()
  252. while True:
  253. try:
  254. retrieval_response = client.documents.retrieve(id=doc_id)
  255. if retrieval_response.results.ingestion_status == "success":
  256. break
  257. except R2RException:
  258. if time.time() - start_time > 600:
  259. raise TimeoutError("Ingestion didn't complete within timeout")
  260. time.sleep(2)
  261. client.documents.delete(id=doc_id)
  262. def test_chunks_ingestion(client: R2RClient):
  263. """Test ingestion of pre-processed chunks."""
  264. chunks = ["This is chunk 1", "This is chunk 2", "This is chunk 3"]
  265. response = client.documents.create(chunks=chunks, ingestion_mode="fast")
  266. assert response is not None
  267. assert response.results is not None
  268. assert response.results.document_id is not None
  269. client.documents.delete(id=response.results.document_id)
  270. def test_metadata_handling(client: R2RClient):
  271. """Test ingestion with metadata."""
  272. metadata = {
  273. "title": "Test Document",
  274. "author": "Test Author",
  275. "custom_field": "custom_value",
  276. }
  277. try:
  278. doc_id = file_ingestion(
  279. client=client,
  280. # file_path="core/examples/supported_file_types/pdf.pdf",
  281. raw_text="this is test text " + str(time.time()),
  282. ingestion_mode="fast",
  283. metadata=metadata,
  284. cleanup=False,
  285. wait_for_completion=True,
  286. )
  287. # Update metadata with server assigned version
  288. metadata["version"] = "v0"
  289. # Verify metadata
  290. doc = client.documents.retrieve(id=doc_id)
  291. assert doc.results.metadata == metadata
  292. # Cleanup
  293. client.documents.delete(id=doc_id)
  294. except Exception:
  295. raise
  296. def test_img_ingestion(client: R2RClient):
  297. """Test ingestion with metadata."""
  298. with contextlib.suppress(R2RException):
  299. client.documents.delete("65bd45b7-632b-5874-9510-82b4e97b4abc")
  300. result = client.documents.create(
  301. file_path="core/examples/supported_file_types/png.png",
  302. metadata={"title": "Test Document", "author": "Test Author"},
  303. ingestion_config={"vlm":"openai/gpt-4.1"},
  304. run_with_orchestration=False
  305. )
  306. with contextlib.suppress(R2RException):
  307. client.documents.delete(result.results.document_id)
  308. # Commenting out due to lack of Anthropic API Key in the CI/CD environment.
  309. # result = client.documents.create(
  310. # file_path="core/examples/supported_file_types/png.png",
  311. # metadata={"title": "Test Document", "author": "Test Author"},
  312. # ingestion_config={"vlm":"anthropic/anthropic/claude-3-7-sonnet-20250219"},
  313. # run_with_orchestration=False
  314. # )
  315. # with contextlib.suppress(R2RException):
  316. # client.documents.delete(result.results.document_id)
  317. def test_metadata_title_handling(client: R2RClient):
  318. """Test that document title in metadata is properly stored and retrievable."""
  319. # Test with raw text
  320. raw_text_title = "Raw Text Title Test"
  321. raw_text_metadata = {
  322. "title": raw_text_title,
  323. "author": "Test Author",
  324. "custom_field": "custom_value",
  325. }
  326. # Create document with raw text
  327. raw_text_response = client.documents.create(
  328. raw_text="This is test text with title " + str(time.time()),
  329. ingestion_mode="fast",
  330. metadata=raw_text_metadata,
  331. run_with_orchestration=False
  332. )
  333. assert raw_text_response is not None
  334. assert raw_text_response.results is not None
  335. raw_text_doc_id = raw_text_response.results.document_id
  336. # Wait for ingestion to complete
  337. start_time = time.time()
  338. while True:
  339. try:
  340. retrieval_response = client.documents.retrieve(id=raw_text_doc_id)
  341. if retrieval_response.results.ingestion_status == "success":
  342. break
  343. elif retrieval_response.results.ingestion_status == "failed":
  344. raise AssertionError(f"Document ingestion failed: {retrieval_response}")
  345. except R2RException:
  346. if time.time() - start_time > 600:
  347. raise TimeoutError("Ingestion didn't complete within timeout")
  348. time.sleep(2)
  349. # Verify document in list has correct title
  350. list_response = client.documents.list()
  351. raw_text_doc = next((doc for doc in list_response.results
  352. if doc.id == raw_text_doc_id), None)
  353. assert raw_text_doc is not None
  354. assert raw_text_doc.title == raw_text_title
  355. # Verify retrieved document has correct title in metadata
  356. raw_text_doc_detail = client.documents.retrieve(id=raw_text_doc_id)
  357. # Update metadata with server assigned version
  358. raw_text_metadata["version"] = "v0"
  359. assert raw_text_doc_detail.results.metadata == raw_text_metadata
  360. # Test with chunks
  361. chunks_title = "Chunks Title Test"
  362. chunks_metadata = {
  363. "title": chunks_title,
  364. "author": "Test Author",
  365. "custom_field": "custom_value",
  366. }
  367. # Create document with chunks
  368. chunks = ["This is chunk 1 " + str(time.time()),
  369. "This is chunk 2",
  370. "This is chunk 3"]
  371. chunks_response = client.documents.create(
  372. chunks=chunks,
  373. ingestion_mode="fast",
  374. metadata=chunks_metadata,
  375. run_with_orchestration=False
  376. )
  377. assert chunks_response is not None
  378. assert chunks_response.results is not None
  379. chunks_doc_id = chunks_response.results.document_id
  380. # Wait for ingestion to complete
  381. start_time = time.time()
  382. while True:
  383. try:
  384. retrieval_response = client.documents.retrieve(id=chunks_doc_id)
  385. if retrieval_response.results.ingestion_status == "success":
  386. break
  387. elif retrieval_response.results.ingestion_status == "failed":
  388. raise AssertionError(f"Document ingestion failed: {retrieval_response}")
  389. except R2RException:
  390. if time.time() - start_time > 600:
  391. raise TimeoutError("Ingestion didn't complete within timeout")
  392. time.sleep(2)
  393. # Verify document in list has correct title
  394. list_response = client.documents.list()
  395. chunks_doc = next((doc for doc in list_response.results
  396. if doc.id == chunks_doc_id), None)
  397. assert chunks_doc is not None
  398. assert chunks_doc.title == chunks_title
  399. # Verify retrieved document has correct title in metadata
  400. chunks_doc_detail = client.documents.retrieve(id=chunks_doc_id)
  401. # Update metadata with server assigned version
  402. chunks_metadata["version"] = "v0"
  403. assert chunks_doc_detail.results.metadata == chunks_metadata
  404. # Clean up
  405. client.documents.delete(id=raw_text_doc_id)
  406. client.documents.delete(id=chunks_doc_id)