test_documents.py 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622
  1. import time
  2. import uuid
  3. import pytest
  4. from r2r import R2RClient, R2RException
  5. @pytest.fixture
  6. def cleanup_documents(client: R2RClient):
  7. doc_ids = []
  8. def _track_document(doc_id):
  9. doc_ids.append(doc_id)
  10. return doc_id
  11. yield _track_document
  12. # Cleanup all documents
  13. for doc_id in doc_ids:
  14. try:
  15. client.documents.delete(id=doc_id)
  16. except R2RException:
  17. pass
  18. def test_create_document_with_file(client: R2RClient, cleanup_documents):
  19. results = client.documents.create(
  20. file_path="core/examples/data/aristotle.txt",
  21. run_with_orchestration=False,
  22. ).results
  23. doc_id = cleanup_documents(results.document_id)
  24. assert results.document_id, "No document_id returned after file ingestion"
  25. def test_create_document_with_raw_text(client: R2RClient, cleanup_documents):
  26. resp = client.documents.create(raw_text="This is raw text content.",
  27. run_with_orchestration=False)
  28. results = resp.results
  29. doc_id = cleanup_documents(results.document_id)
  30. assert doc_id, "No document_id returned after raw text ingestion"
  31. # Verify retrieval
  32. retrieved = client.documents.retrieve(id=doc_id)
  33. retrieved_results = retrieved.results
  34. assert retrieved_results.id == doc_id, (
  35. "Failed to retrieve the ingested raw text document")
  36. def test_create_document_with_chunks(client: R2RClient, cleanup_documents):
  37. suffix = str(uuid.uuid4())[:8]
  38. resp = client.documents.create(
  39. chunks=[f"Chunk one{suffix}", f"Chunk two{suffix}"],
  40. run_with_orchestration=False,
  41. )
  42. results = resp.results
  43. doc_id = cleanup_documents(results.document_id)
  44. assert doc_id, "No document_id returned after chunk ingestion"
  45. retrieved = client.documents.retrieve(id=doc_id)
  46. retrieved_results = retrieved.results
  47. assert retrieved_results.id == doc_id, (
  48. "Failed to retrieve the chunk-based document")
  49. def test_create_document_different_modes(client: R2RClient, cleanup_documents):
  50. # hi-res mode
  51. hi_res_resp = client.documents.create(
  52. raw_text="High resolution doc.",
  53. ingestion_mode="hi-res",
  54. run_with_orchestration=False,
  55. ).results
  56. hi_res_id = cleanup_documents(hi_res_resp.document_id)
  57. assert hi_res_id, "No doc_id returned for hi-res ingestion"
  58. # fast mode
  59. fast_resp = client.documents.create(
  60. raw_text="Fast mode doc.",
  61. ingestion_mode="fast",
  62. run_with_orchestration=False,
  63. ).results
  64. fast_id = cleanup_documents(fast_resp.document_id)
  65. assert fast_id, "No doc_id returned for fast ingestion"
  66. def test_list_documents(client: R2RClient, test_document):
  67. results = client.documents.list(offset=0, limit=10).results
  68. assert isinstance(results, list), "Documents list response is not a list"
  69. assert len(results) >= 1, "Expected at least one document"
  70. # test_document is created for this test, so we expect at least that one present.
  71. def test_retrieve_document(client: R2RClient, test_document):
  72. retrieved = client.documents.retrieve(id=test_document).results
  73. assert retrieved.id == test_document, "Retrieved wrong document"
  74. def test_download_document(client: R2RClient, test_document):
  75. # For text-only documents, the endpoint returns text as a buffer
  76. content = client.documents.download(id=test_document)
  77. assert content, "Failed to download document content"
  78. data = content.getvalue()
  79. assert len(data) > 0, "Document content is empty"
  80. def test_delete_document(client: R2RClient):
  81. # Create a doc to delete
  82. resp = client.documents.create(raw_text="This is a temporary doc",
  83. run_with_orchestration=False).results
  84. doc_id = resp.document_id
  85. del_resp = client.documents.delete(id=doc_id).results
  86. assert del_resp.success, "Failed to delete document"
  87. # Verify it's gone
  88. with pytest.raises(R2RException) as exc_info:
  89. client.documents.retrieve(id=doc_id)
  90. assert exc_info.value.status_code == 404, "Expected 404 after deletion"
  91. def test_delete_document_by_filter(client: R2RClient):
  92. # Create a doc with unique metadata
  93. resp = client.documents.create(
  94. raw_text="Document to be filtered out",
  95. metadata={
  96. "to_delete": "yes"
  97. },
  98. run_with_orchestration=False,
  99. ).results
  100. doc_id = resp.document_id
  101. filters = {"to_delete": {"$eq": "yes"}}
  102. del_resp = client.documents.delete_by_filter(filters).results
  103. assert del_resp.success, "Failed to delete documents by filter"
  104. # Verify deletion
  105. with pytest.raises(R2RException) as exc_info:
  106. client.documents.retrieve(id=doc_id)
  107. assert exc_info.value.status_code == 404, (
  108. "Document still exists after filter-based deletion")
  109. # @pytest.mark.skip(reason="Only if superuser-specific logic is implemented")
  110. def test_list_document_collections(client: R2RClient, test_document):
  111. # This test assumes the currently logged in user is a superuser
  112. collections = client.documents.list_collections(id=test_document).results
  113. assert isinstance(collections,
  114. list), ("Document collections list is not a list")
  115. # @pytest.mark.skip(
  116. # reason="Requires actual entity extraction logic implemented and superuser access"
  117. # )
  118. def test_extract_document(client: R2RClient, test_document):
  119. time.sleep(10)
  120. run_resp = client.documents.extract(id=test_document,
  121. run_with_orchestration=False).results
  122. assert run_resp.message is not None, "No message after extraction run"
  123. # @pytest.mark.skip(reason="Requires entity extraction results present")
  124. def test_list_entities(client: R2RClient, test_document):
  125. # If no entities extracted yet, this could raise an exception
  126. try:
  127. entities = client.documents.list_entities(id=test_document).results
  128. assert isinstance(entities, list), "Entities response not a list"
  129. except R2RException as e:
  130. # Possibly no entities extracted yet
  131. pytest.skip(f"No entities extracted yet: {str(e)}")
  132. # @pytest.mark.skip(reason="Requires relationship extraction results present")
  133. def test_list_relationships(client: R2RClient, test_document):
  134. try:
  135. relationships = client.documents.list_relationships(
  136. id=test_document).results
  137. assert isinstance(relationships,
  138. list), ("Relationships response not a list")
  139. except R2RException as e:
  140. pytest.skip(f"No relationships extracted yet: {str(e)}")
  141. def test_search_documents(client: R2RClient, test_document):
  142. # Add some delay if indexing takes time
  143. time.sleep(1)
  144. query = "Temporary"
  145. search_results = client.documents.search(query=query,
  146. search_mode="custom",
  147. search_settings={"limit": 5})
  148. assert search_results.results is not None, "Search results key not found"
  149. # We cannot guarantee a match, but at least we got a well-formed response
  150. assert isinstance(search_results.results,
  151. list), ("Search results not a list")
  152. def test_list_document_chunks(mutable_client: R2RClient, cleanup_documents):
  153. temp_user = f"{uuid.uuid4()}@me.com"
  154. mutable_client.users.create(temp_user, "password")
  155. mutable_client.users.login(temp_user, "password")
  156. resp = mutable_client.documents.create(
  157. chunks=["C1", "C2", "C3"], run_with_orchestration=False).results
  158. doc_id = cleanup_documents(resp.document_id)
  159. chunks_resp = mutable_client.documents.list_chunks(id=doc_id)
  160. results = chunks_resp.results
  161. assert len(results) == 3, "Expected 3 chunks"
  162. mutable_client.users.logout()
  163. def test_search_documents_extended(client: R2RClient, cleanup_documents):
  164. doc_id = cleanup_documents(
  165. client.documents.create(
  166. raw_text="Aristotle was a Greek philosopher.",
  167. run_with_orchestration=False,
  168. ).results.document_id)
  169. time.sleep(1) # If indexing is asynchronous
  170. search_results = client.documents.search(
  171. query="Greek philosopher",
  172. search_mode="basic",
  173. search_settings={"limit": 1},
  174. )
  175. assert search_results.results is not None, (
  176. "No results key in search response")
  177. assert len(search_results.results) > 0, "No documents found"
  178. def test_retrieve_document_not_found(client):
  179. bad_id = str(uuid.uuid4())
  180. with pytest.raises(R2RException) as exc_info:
  181. client.documents.retrieve(id=bad_id)
  182. assert exc_info.value.status_code == 404, "Wrong error code for not found"
  183. def test_delete_document_non_existent(client):
  184. bad_id = str(uuid.uuid4())
  185. with pytest.raises(R2RException) as exc_info:
  186. client.documents.delete(id=bad_id)
  187. assert exc_info.value.status_code == 404, (
  188. "Wrong error code for delete non-existent")
  189. # @pytest.mark.skip(reason="If your API restricts this endpoint to superusers")
  190. def test_get_document_collections_non_superuser(client):
  191. # Create a non-superuser client
  192. non_super_client = R2RClient(client.base_url)
  193. random_string = str(uuid.uuid4())
  194. non_super_client.users.create(f"{random_string}@me.com", "password")
  195. non_super_client.users.login(f"{random_string}@me.com", "password")
  196. document_id = str(uuid.uuid4()) # Some doc ID
  197. with pytest.raises(R2RException) as exc_info:
  198. non_super_client.documents.list_collections(id=document_id)
  199. assert exc_info.value.status_code == 403, (
  200. "Expected 403 for non-superuser collections access")
  201. def test_access_document_not_owned(client: R2RClient, cleanup_documents):
  202. # Create a doc as superuser
  203. doc_id = cleanup_documents(
  204. client.documents.create(
  205. raw_text="Owner doc test",
  206. run_with_orchestration=False).results.document_id)
  207. # Now try to access with a non-superuser
  208. non_super_client = R2RClient(client.base_url)
  209. random_string = str(uuid.uuid4())
  210. non_super_client.users.create(f"{random_string}@me.com", "password")
  211. non_super_client.users.login(f"{random_string}@me.com", "password")
  212. with pytest.raises(R2RException) as exc_info:
  213. non_super_client.documents.download(id=doc_id)
  214. assert exc_info.value.status_code == 403, (
  215. "Wrong error code for unauthorized access")
  216. def test_list_documents_with_pagination(mutable_client: R2RClient,
  217. cleanup_documents):
  218. temp_user = f"{uuid.uuid4()}@me.com"
  219. mutable_client.users.create(temp_user, "password")
  220. mutable_client.users.login(temp_user, "password")
  221. for i in range(3):
  222. cleanup_documents(
  223. mutable_client.documents.create(
  224. raw_text=f"Doc {i}",
  225. run_with_orchestration=False).results.document_id)
  226. listed = mutable_client.documents.list(limit=2, offset=0)
  227. results = listed.results
  228. assert len(results) == 2, "Expected 2 results for paginated listing"
  229. def test_ingest_invalid_chunks(client):
  230. invalid_chunks = ["Valid chunk", 12345, {"not": "a string"}]
  231. with pytest.raises(R2RException) as exc_info:
  232. client.documents.create(chunks=invalid_chunks,
  233. run_with_orchestration=False)
  234. assert exc_info.value.status_code in [
  235. 400,
  236. 422,
  237. ], "Expected validation error for invalid chunks"
  238. def test_ingest_too_many_chunks(client: R2RClient):
  239. excessive_chunks = ["Chunk"] * (1024 * 100 + 1) # Just over the limit
  240. with pytest.raises(R2RException) as exc_info:
  241. client.documents.create(chunks=excessive_chunks,
  242. run_with_orchestration=False)
  243. assert exc_info.value.status_code == 400, (
  244. "Wrong error code for exceeding max chunks")
  245. def test_chunk_size_and_overlap(client: R2RClient, cleanup_documents):
  246. test_text = "This is a test document with chunk size and overlap settings that we want to verify."
  247. document_id = cleanup_documents(
  248. client.documents.create(
  249. raw_text=test_text,
  250. ingestion_config={
  251. "chunk_size": 10,
  252. "chunk_overlap": 2,
  253. },
  254. run_with_orchestration=False
  255. ).results.document_id
  256. )
  257. time.sleep(1)
  258. chunks = client.documents.list_chunks(id=document_id).results
  259. assert len(chunks) > 0, "No chunks were created"
  260. # Verify each chunk respects the maximum size
  261. for chunk in chunks:
  262. assert len(chunk.text) <= 10, f"Chunk exceeds maximum size: '{chunk.text}'"
  263. long_text = "Here is a longer document that we can use to test larger chunk sizes and overlaps to ensure the chunking algorithm works properly across different configurations."
  264. document_id2 = cleanup_documents(
  265. client.documents.create(
  266. raw_text=long_text,
  267. ingestion_config={
  268. "chunk_size": 20,
  269. "chunk_overlap": 5,
  270. },
  271. run_with_orchestration=False
  272. ).results.document_id
  273. )
  274. chunks2 = client.documents.list_chunks(id=document_id2).results
  275. assert len(chunks2) > 0, "No chunks were created for the second document"
  276. for chunk in chunks2:
  277. assert len(chunk.text) <= 20, f"Chunk exceeds maximum size: '{chunk.text}'"
  278. def test_delete_by_complex_filter(client: R2RClient, cleanup_documents):
  279. doc1 = cleanup_documents(
  280. client.documents.create(
  281. raw_text="Doc with tag A",
  282. metadata={
  283. "tag": "A"
  284. },
  285. run_with_orchestration=False,
  286. ).results.document_id)
  287. doc2 = cleanup_documents(
  288. client.documents.create(
  289. raw_text="Doc with tag B",
  290. metadata={
  291. "tag": "B"
  292. },
  293. run_with_orchestration=False,
  294. ).results.document_id)
  295. filters = {"$or": [{"tag": {"$eq": "A"}}, {"tag": {"$eq": "B"}}]}
  296. del_resp = client.documents.delete_by_filter(filters).results
  297. assert del_resp.success, "Complex filter deletion failed"
  298. # Verify both documents are deleted
  299. for d_id in [doc1, doc2]:
  300. with pytest.raises(R2RException) as exc_info:
  301. client.documents.retrieve(d_id)
  302. assert exc_info.value.status_code == 404, (
  303. f"Document {d_id} still exists after deletion")
  304. def test_search_documents_no_match(client: R2RClient, cleanup_documents):
  305. doc_id = cleanup_documents(
  306. client.documents.create(
  307. raw_text="Just a random document",
  308. metadata={
  309. "category": "unrelated"
  310. },
  311. run_with_orchestration=False,
  312. ).results.document_id)
  313. # Search for non-existent category
  314. search_results = client.documents.search(
  315. query="nonexistent category",
  316. search_mode="basic",
  317. search_settings={
  318. "filters": {
  319. "category": {
  320. "$eq": "doesnotexist"
  321. }
  322. },
  323. "limit": 10,
  324. },
  325. )
  326. assert search_results.results is not None, "Search missing results key"
  327. assert len(search_results.results) == 0, "Expected zero results"
  328. import pytest
  329. def test_delete_by_workflow_metadata(client: R2RClient, cleanup_documents):
  330. """Test deletion by workflow state metadata."""
  331. # Create test documents with workflow metadata
  332. random_suffix = uuid.uuid4()
  333. docs = []
  334. try:
  335. docs.append(
  336. cleanup_documents(
  337. client.documents.create(
  338. raw_text="Draft document 1" + str(random_suffix),
  339. metadata={
  340. "workflow": {
  341. "state": "draft",
  342. "assignee": "user1",
  343. "review_count": 0,
  344. }
  345. },
  346. run_with_orchestration=False,
  347. ).results.document_id))
  348. docs.append(
  349. cleanup_documents(
  350. client.documents.create(
  351. raw_text="Draft document 2" + str(random_suffix),
  352. metadata={
  353. "workflow": {
  354. "state": "draft",
  355. "assignee": "user2",
  356. "review_count": 1,
  357. }
  358. },
  359. run_with_orchestration=False,
  360. ).results.document_id))
  361. docs.append(
  362. cleanup_documents(
  363. client.documents.create(
  364. raw_text="Published document" + str(random_suffix),
  365. metadata={
  366. "workflow": {
  367. "state": "published",
  368. "assignee": "user1",
  369. "review_count": 2,
  370. }
  371. },
  372. run_with_orchestration=False,
  373. ).results.document_id))
  374. # Delete drafts with no reviews
  375. filters = {
  376. "$and": [
  377. {
  378. "metadata.workflow.state": {
  379. "$eq": "draft"
  380. }
  381. },
  382. {
  383. "metadata.workflow.review_count": {
  384. "$eq": 0
  385. }
  386. },
  387. ]
  388. }
  389. response = client.documents.delete_by_filter(filters).results
  390. assert response.success
  391. # Verify first draft is deleted
  392. with pytest.raises(R2RException) as exc:
  393. client.documents.retrieve(id=docs[0])
  394. assert exc.value.status_code == 404
  395. # Verify other documents still exist
  396. assert client.documents.retrieve(id=docs[1])
  397. assert client.documents.retrieve(id=docs[2])
  398. except Exception:
  399. raise
  400. def test_delete_by_classification_metadata(client: R2RClient,
  401. cleanup_documents):
  402. """Test deletion by document classification metadata."""
  403. docs = []
  404. try:
  405. docs.append(
  406. cleanup_documents(
  407. client.documents.create(
  408. raw_text="Confidential document",
  409. metadata={
  410. "classification": {
  411. "level": "confidential",
  412. "department": "HR",
  413. "retention_years": 7,
  414. }
  415. },
  416. run_with_orchestration=False,
  417. ).results.document_id))
  418. docs.append(
  419. cleanup_documents(
  420. client.documents.create(
  421. raw_text="Public document",
  422. metadata={
  423. "classification": {
  424. "level": "public",
  425. "department": "Marketing",
  426. "retention_years": 1,
  427. }
  428. },
  429. run_with_orchestration=False,
  430. ).results.document_id))
  431. # Delete HR documents with high retention
  432. filters = {
  433. "$and": [
  434. {
  435. "classification.department": {
  436. "$eq": "HR"
  437. }
  438. },
  439. {
  440. "classification.retention_years": {
  441. "$gt": 5
  442. }
  443. },
  444. ]
  445. }
  446. response = client.documents.delete_by_filter(filters).results
  447. assert response.success
  448. # Verify confidential HR doc is deleted
  449. with pytest.raises(R2RException) as exc:
  450. client.documents.retrieve(id=docs[0])
  451. assert exc.value.status_code == 404
  452. # Verify public doc still exists
  453. assert client.documents.retrieve(id=docs[1])
  454. except Exception:
  455. raise
  456. def test_delete_by_version_metadata(client: R2RClient, cleanup_documents):
  457. """Test deletion by version and status metadata with array conditions."""
  458. suffix = uuid.uuid4()
  459. docs = []
  460. try:
  461. docs.append(
  462. cleanup_documents(
  463. client.documents.create(
  464. raw_text="Old version document" + str(suffix),
  465. metadata={
  466. "version_info": {
  467. "number": "1.0.0",
  468. "status": "deprecated",
  469. "tags": ["legacy", "unsupported"],
  470. },
  471. },
  472. run_with_orchestration=False,
  473. ).results.document_id))
  474. docs.append(
  475. cleanup_documents(
  476. client.documents.create(
  477. raw_text="Current version document" + str(suffix),
  478. metadata={
  479. "version_info": {
  480. "number": "2.0.0",
  481. "status": "current",
  482. "tags": ["stable", "supported"],
  483. },
  484. },
  485. run_with_orchestration=False,
  486. ).results.document_id))
  487. # Delete deprecated documents with legacy tag
  488. filters = {
  489. "$and": [
  490. {
  491. "metadata.version_info.status": {
  492. "$eq": "deprecated"
  493. }
  494. },
  495. {
  496. "metadata.version_info.tags": {
  497. "$in": ["legacy"]
  498. }
  499. },
  500. ]
  501. }
  502. response = client.documents.delete_by_filter(filters).results
  503. assert response.success
  504. # Verify deprecated doc is deleted
  505. with pytest.raises(R2RException) as exc:
  506. doc = client.documents.retrieve(id=docs[0])
  507. print('doc = ', doc)
  508. assert exc.value.status_code == 404
  509. # Verify current doc still exists
  510. assert client.documents.retrieve(id=docs[1])
  511. except Exception:
  512. raise