test_ingestion.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. """
Tests document ingestion functionality in R2R across all supported file types and modes.

Supported file types include:
- Documents: .doc, .docx, .odt, .pdf, .rtf, .txt
- Presentations: .ppt, .pptx
- Spreadsheets: .csv, .tsv, .xls, .xlsx
- Markup: .html, .md, .org, .rst
- Images: .bmp, .heic, .jpeg, .jpg, .png, .tiff
- Email: .eml, .msg, .p7s
- Other: .epub, .json

Tests verify:
- Basic ingestion for each file type
- Hi-res ingestion for complex documents
- Custom ingestion configurations
- Raw text ingestion
- Pre-processed chunk ingestion
- Metadata handling
  18. """
  19. import time
  20. from pathlib import Path
  21. from typing import Any, Optional
  22. from uuid import UUID
  23. import pytest
  24. from r2r import R2RClient, R2RException
  25. def file_ingestion(
  26. client: R2RClient,
  27. file_path: str,
  28. ingestion_mode: Optional[str] = None,
  29. expected_status: str = "success",
  30. expected_chunk_count: Optional[int] = None,
  31. ingestion_config: Optional[dict] = None,
  32. metadata: Optional[dict] = None,
  33. cleanup: bool = True,
  34. wait_for_completion: bool = True,
  35. timeout: int = 600,
  36. ) -> UUID:
  37. """
  38. Test ingestion of a file with the given parameters.
  39. Args:
  40. client: R2RClient instance
  41. file_path: Path to the file to ingest
  42. ingestion_mode: Optional ingestion mode ("fast", "hi-res", or None for default)
  43. expected_status: Expected final status of the document
  44. expected_chunk_count: Optional number of chunks to expect
  45. cleanup: Whether to delete the document after testing
  46. wait_for_completion: Whether to wait for ingestion to complete
  47. timeout: Maximum time to wait for ingestion completion in seconds
  48. Returns:
  49. dict: Document details after ingestion
  50. Raises:
  51. AssertionError: If any checks fail
  52. TimeoutError: If ingestion doesn't complete within timeout period
  53. """
  54. doc_id = None
  55. try:
  56. # Verify file exists
  57. assert Path(file_path).exists(), f"Test file not found: {file_path}"
  58. # Start ingestion
  59. ingest_args: dict[str, Any] = {"file_path": file_path}
  60. if ingestion_mode:
  61. ingest_args["ingestion_mode"] = ingestion_mode
  62. if ingestion_config:
  63. ingest_args["ingestion_config"] = ingestion_config
  64. if metadata:
  65. ingest_args["metadata"] = metadata
  66. ingestion_response = client.documents.create(**ingest_args)
  67. assert ingestion_response is not None
  68. assert "results" in ingestion_response
  69. assert "document_id" in ingestion_response["results"]
  70. doc_id = ingestion_response["results"]["document_id"]
  71. if wait_for_completion:
  72. time.sleep(2)
  73. start_time = time.time()
  74. while True:
  75. try:
  76. retrieval_response = client.documents.retrieve(id=doc_id)
  77. ingestion_status = retrieval_response["results"][
  78. "ingestion_status"
  79. ]
  80. if ingestion_status == expected_status:
  81. break
  82. elif ingestion_status == "failed":
  83. raise AssertionError(
  84. f"Document ingestion failed: {retrieval_response}"
  85. )
  86. except R2RException as e:
  87. if e.status_code == 404:
  88. # Document not yet available, continue polling if within timeout
  89. if time.time() - start_time > timeout:
  90. raise TimeoutError(
  91. f"Ingestion didn't complete within {timeout} seconds"
  92. )
  93. else:
  94. # Re-raise other errors
  95. raise
  96. time.sleep(2)
  97. finally:
  98. if cleanup and doc_id is not None:
  99. try:
  100. client.documents.delete(id=doc_id)
  101. except R2RException:
  102. # Ignore cleanup errors
  103. pass
  104. return doc_id
  105. @pytest.fixture(scope="session")
  106. def config():
  107. class TestConfig:
  108. base_url = "http://localhost:7272"
  109. superuser_email = "admin@example.com"
  110. superuser_password = "change_me_immediately"
  111. return TestConfig()
  112. @pytest.fixture(scope="session")
  113. def client(config):
  114. """Create a client instance and log in as a superuser."""
  115. client = R2RClient(config.base_url)
  116. client.users.login(config.superuser_email, config.superuser_password)
  117. return client
  118. @pytest.mark.parametrize(
  119. "file_type,file_path",
  120. [
  121. ("bmp", "core/examples/supported_file_types/bmp.bmp"),
  122. ("csv", "core/examples/supported_file_types/csv.csv"),
  123. ("doc", "core/examples/supported_file_types/doc.doc"),
  124. ("docx", "core/examples/supported_file_types/docx.docx"),
  125. ("eml", "core/examples/supported_file_types/eml.eml"),
  126. ("epub", "core/examples/supported_file_types/epub.epub"),
  127. ("heic", "core/examples/supported_file_types/heic.heic"),
  128. ("html", "core/examples/supported_file_types/html.html"),
  129. ("json", "core/examples/supported_file_types/json.json"),
  130. ("jpeg", "core/examples/supported_file_types/jpeg.jpeg"),
  131. ("jpg", "core/examples/supported_file_types/jpg.jpg"),
  132. ("md", "core/examples/supported_file_types/md.md"),
  133. ("msg", "core/examples/supported_file_types/msg.msg"),
  134. ("odt", "core/examples/supported_file_types/odt.odt"),
  135. ("org", "core/examples/supported_file_types/org.org"),
  136. ("p7s", "core/examples/supported_file_types/p7s.p7s"),
  137. ("pdf", "core/examples/supported_file_types/pdf.pdf"),
  138. ("png", "core/examples/supported_file_types/png.png"),
  139. ("ppt", "core/examples/supported_file_types/ppt.ppt"),
  140. ("pptx", "core/examples/supported_file_types/pptx.pptx"),
  141. ("rst", "core/examples/supported_file_types/rst.rst"),
  142. ("rtf", "core/examples/supported_file_types/rtf.rtf"),
  143. ("tiff", "core/examples/supported_file_types/tiff.tiff"),
  144. ("txt", "core/examples/supported_file_types/txt.txt"),
  145. ("tsv", "core/examples/supported_file_types/tsv.tsv"),
  146. ("xls", "core/examples/supported_file_types/xls.xls"),
  147. ("xlsx", "core/examples/supported_file_types/xlsx.xlsx"),
  148. ],
  149. )
  150. def test_file_type_ingestion(
  151. client: R2RClient, file_type: str, file_path: str
  152. ):
  153. """Test ingestion of specific file type."""
  154. try:
  155. result = file_ingestion(
  156. client=client,
  157. file_path=file_path,
  158. cleanup=True,
  159. wait_for_completion=True,
  160. )
  161. assert result is not None
  162. except Exception as e:
  163. raise
  164. @pytest.mark.parametrize(
  165. "file_type,file_path",
  166. [
  167. ("pdf", "core/examples/supported_file_types/pdf.pdf"),
  168. ("docx", "core/examples/supported_file_types/docx.docx"),
  169. ("pptx", "core/examples/supported_file_types/pptx.pptx"),
  170. ],
  171. )
  172. def test_hires_ingestion(client: R2RClient, file_type: str, file_path: str):
  173. """Test hi-res ingestion with complex documents containing mixed content."""
  174. if file_type == "pdf":
  175. try:
  176. result = file_ingestion(
  177. client=client,
  178. file_path=file_path,
  179. ingestion_mode="hi-res",
  180. cleanup=True,
  181. wait_for_completion=True,
  182. )
  183. assert result is not None
  184. except Exception as e: # Changed from R2RException to Exception
  185. if "PDF processing requires Poppler to be installed" in str(e):
  186. pytest.skip(
  187. "Skipping PDF test due to missing Poppler dependency"
  188. )
  189. raise
  190. else:
  191. result = file_ingestion(
  192. client=client,
  193. file_path=file_path,
  194. ingestion_mode="hi-res",
  195. cleanup=True,
  196. wait_for_completion=True,
  197. )
  198. assert result is not None
  199. def test_custom_ingestion_config(client: R2RClient):
  200. """Test ingestion with custom configuration parameters."""
  201. custom_config = {
  202. "provider": "r2r",
  203. "strategy": "auto",
  204. # "chunking_strategy": "by_title", Fixme: This was not implemented in the ingestion config
  205. "new_after_n_chars": 256,
  206. "max_characters": 512,
  207. "combine_under_n_chars": 64,
  208. "overlap": 100,
  209. }
  210. try:
  211. result = file_ingestion(
  212. client=client,
  213. file_path="core/examples/supported_file_types/pdf.pdf",
  214. ingestion_mode="custom",
  215. ingestion_config=custom_config,
  216. cleanup=True,
  217. wait_for_completion=True,
  218. )
  219. assert result is not None
  220. except Exception as e:
  221. raise
  222. def test_raw_text_ingestion(client: R2RClient):
  223. """Test ingestion of raw text content."""
  224. text_content = "This is a test document.\nIt has multiple lines.\nTesting raw text ingestion."
  225. response = client.documents.create(
  226. raw_text=text_content, ingestion_mode="fast"
  227. )
  228. assert response is not None
  229. assert "results" in response
  230. assert "document_id" in response["results"]
  231. doc_id = response["results"]["document_id"]
  232. start_time = time.time()
  233. while True:
  234. try:
  235. retrieval_response = client.documents.retrieve(id=doc_id)
  236. if retrieval_response["results"]["ingestion_status"] == "success":
  237. break
  238. except R2RException as e:
  239. if time.time() - start_time > 600:
  240. raise TimeoutError("Ingestion didn't complete within timeout")
  241. time.sleep(2)
  242. client.documents.delete(id=doc_id)
  243. def test_chunks_ingestion(client: R2RClient):
  244. """Test ingestion of pre-processed chunks."""
  245. chunks = ["This is chunk 1", "This is chunk 2", "This is chunk 3"]
  246. response = client.documents.create(chunks=chunks, ingestion_mode="fast")
  247. assert response is not None
  248. assert "results" in response
  249. assert "document_id" in response["results"]
  250. client.documents.delete(id=response["results"]["document_id"])
  251. def test_metadata_handling(client: R2RClient):
  252. """Test ingestion with metadata."""
  253. metadata = {
  254. "title": "Test Document",
  255. "author": "Test Author",
  256. "custom_field": "custom_value",
  257. }
  258. try:
  259. doc_id = file_ingestion(
  260. client=client,
  261. file_path="core/examples/supported_file_types/pdf.pdf",
  262. ingestion_mode="fast",
  263. metadata=metadata,
  264. cleanup=False,
  265. wait_for_completion=True,
  266. )
  267. # Update metadata with server assigned version
  268. metadata["version"] = "v0"
  269. # Verify metadata
  270. doc = client.documents.retrieve(id=doc_id)
  271. assert doc["results"]["metadata"] == metadata
  272. # Cleanup
  273. client.documents.delete(id=doc_id)
  274. except Exception as e:
  275. raise