test_documents.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. import time
  2. import uuid
  3. import pytest
  4. from r2r import R2RClient, R2RException
  def test_create_document_with_file(client):
      """Ingest a text file and check that a document id comes back.

      The created document is deleted again once the check has passed.
      """
      creation = client.documents.create(
          file_path="core/examples/data/aristotle.txt",
          run_with_orchestration=False,
      )
      results = creation["results"]
      # A missing key and an empty/falsy id are both treated as a failure.
      has_document_id = "document_id" in results and results["document_id"]
      assert has_document_id, "No document_id returned after file ingestion"

      # Cleanup
      client.documents.delete(id=results["document_id"])
  def test_create_document_with_raw_text(client):
      """Ingest raw text and verify the new document can be retrieved by id.

      The document is removed in a ``finally`` block so that a failing
      assertion or retrieval error does not leave it behind on the server.
      """
      resp = client.documents.create(
          raw_text="This is raw text content.", run_with_orchestration=False
      )["results"]
      doc_id = resp["document_id"]
      try:
          assert doc_id, "No document_id returned after raw text ingestion"

          # Verify retrieval
          retrieved = client.documents.retrieve(id=doc_id)["results"]
          assert (
              retrieved["id"] == doc_id
          ), "Failed to retrieve the ingested raw text document"
      finally:
          # Cleanup runs even when a check above fails; skipped when no id
          # was returned because there is nothing to delete in that case.
          if doc_id:
              client.documents.delete(id=doc_id)
  def test_create_document_with_chunks(client):
      """Ingest two pre-split chunks and verify the document is retrievable.

      A short random suffix is appended to both chunk texts. The document is
      deleted in a ``finally`` block so a failed check does not leave it
      behind on the server.
      """
      suffix = str(uuid.uuid4())[:8]
      resp = client.documents.create(
          chunks=["Chunk one" + suffix, "Chunk two" + suffix],
          run_with_orchestration=False,
      )["results"]
      doc_id = resp["document_id"]
      try:
          assert doc_id, "No document_id returned after chunk ingestion"

          retrieved = client.documents.retrieve(id=doc_id)["results"]
          assert (
              retrieved["id"] == doc_id
          ), "Failed to retrieve the chunk-based document"
      finally:
          # Cleanup runs even when a check above fails; skipped when no id
          # was returned because there is nothing to delete in that case.
          if doc_id:
              client.documents.delete(id=doc_id)
  def test_create_document_different_modes(client):
      """Ingest raw text in "hi-res" mode and then in "fast" mode.

      For each mode the returned document id is checked and the document is
      deleted again before the next mode is tried.
      """
      # (ingestion mode, raw text, assertion message), processed in order.
      cases = [
          (
              "hi-res",
              "High resolution doc.",
              "No doc_id returned for hi-res ingestion",
          ),
          (
              "fast",
              "Fast mode doc.",
              "No doc_id returned for fast ingestion",
          ),
      ]
      for mode, text, failure_message in cases:
          created = client.documents.create(
              raw_text=text,
              ingestion_mode=mode,
              run_with_orchestration=False,
          )["results"]
          document_id = created["document_id"]
          assert document_id, failure_message
          client.documents.delete(id=document_id)
  def test_list_documents(client, test_document):
      """List the first ten documents and expect a non-empty list.

      The ``test_document`` fixture is requested so that at least one
      document should be present when the listing runs.
      """
      page = client.documents.list(offset=0, limit=10)
      documents = page["results"]
      assert isinstance(documents, list), "Documents list response is not a list"
      assert len(documents) >= 1, "Expected at least one document"
  def test_retrieve_document(client, test_document):
      """Fetch the fixture document by id and check the returned id matches."""
      response = client.documents.retrieve(id=test_document)
      document = response["results"]
      assert document["id"] == test_document, "Retrieved wrong document"
  def test_download_document(client, test_document):
      """Download the fixture document and check the payload is non-empty."""
      # For text-only documents, the endpoint returns text as a buffer
      buffer = client.documents.download(id=test_document)
      assert buffer, "Failed to download document content"

      payload = buffer.getvalue()
      assert len(payload) > 0, "Document content is empty"
  def test_delete_document(client):
      """Create a throwaway document, delete it, and expect a 404 afterwards."""
      # Create a doc to delete
      created = client.documents.create(
          raw_text="This is a temporary doc", run_with_orchestration=False
      )
      doc_id = created["results"]["document_id"]

      deletion = client.documents.delete(id=doc_id)
      assert deletion["results"]["success"], "Failed to delete document"

      # Verify it's gone
      with pytest.raises(R2RException) as exc_info:
          client.documents.retrieve(id=doc_id)
      status = exc_info.value.status_code
      assert status == 404, "Expected 404 after deletion"
  def test_delete_document_by_filter(client):
      """Delete a document through a metadata filter and expect a 404 after.

      A document tagged with ``{"to_delete": "yes"}`` is created, removed
      with ``delete_by_filter`` and then looked up again. A best-effort
      direct delete in ``finally`` keeps the document from lingering when
      the filter-based deletion itself is what fails.
      """
      # Create a doc with unique metadata
      resp = client.documents.create(
          raw_text="Document to be filtered out",
          metadata={"to_delete": "yes"},
          run_with_orchestration=False,
      )["results"]
      doc_id = resp["document_id"]

      try:
          filters = {"to_delete": {"$eq": "yes"}}
          del_resp = client.documents.delete_by_filter(filters)["results"]
          assert del_resp["success"], "Failed to delete documents by filter"

          # Verify deletion
          with pytest.raises(R2RException) as exc_info:
              client.documents.retrieve(id=doc_id)
          assert (
              exc_info.value.status_code == 404
          ), "Document still exists after filter-based deletion"
      finally:
          # Best-effort cleanup: on the success path the document is already
          # gone, so an R2RException from this delete is expected and ignored.
          try:
              client.documents.delete(id=doc_id)
          except R2RException:
              pass
  105. # @pytest.mark.skip(reason="Only if superuser-specific logic is implemented")
  def test_list_document_collections(client, test_document):
      """List the collections of the fixture document and expect a list."""
      # This test assumes the currently logged in user is a superuser
      response = client.documents.list_collections(id=test_document)
      collections = response["results"]
      assert isinstance(
          collections, list
      ), "Document collections list is not a list"
  114. # @pytest.mark.skip(
  115. # reason="Requires actual entity extraction logic implemented and superuser access"
  116. # )
  def test_extract_document(client, test_document):
      """Run extraction on the fixture document and expect a message back."""
      # Fixed pause before extraction.
      # NOTE(review): presumably gives ingestion time to settle - confirm
      # whether it is still required.
      time.sleep(10)
      extraction = client.documents.extract(
          id=test_document, run_type="run", run_with_orchestration=False
      )
      run_resp = extraction["results"]
      assert "message" in run_resp, "No message after extraction run"
  123. # @pytest.mark.skip(reason="Requires entity extraction results present")
  def test_list_entities(client, test_document):
      """List entities of the fixture document, skipping on an API error.

      An ``R2RException`` from the listing call turns the test into a skip
      instead of a failure.
      """
      # If no entities extracted yet, this could raise an exception
      try:
          listing = client.documents.list_entities(id=test_document)
          entities = listing["results"]
      except R2RException as e:
          # Possibly no entities extracted yet
          pytest.skip(f"No entities extracted yet: {str(e)}")
      else:
          assert isinstance(entities, list), "Entities response not a list"
  132. # @pytest.mark.skip(reason="Requires relationship extraction results present")
  def test_list_relationships(client, test_document):
      """List relationships of the fixture document, skipping on an API error.

      An ``R2RException`` from the listing call turns the test into a skip
      instead of a failure.
      """
      try:
          listing = client.documents.list_relationships(id=test_document)
          relationships = listing["results"]
      except R2RException as e:
          pytest.skip(f"No relationships extracted yet: {str(e)}")
      else:
          assert isinstance(
              relationships, list
          ), "Relationships response not a list"
  def test_search_documents(client, test_document):
      """Run a custom-mode document search and check the response shape."""
      # Add some delay if indexing takes time
      time.sleep(1)

      settings = {"limit": 5}
      response = client.documents.search(
          query="Temporary", search_mode="custom", search_settings=settings
      )
      assert "results" in response, "Search results key not found"

      # We cannot guarantee a match, but at least we got a well-formed response
      hits = response["results"]
      assert isinstance(hits, list), "Search results not a list"
  def test_list_document_chunks(mutable_client):
      """Create a three-chunk document as a fresh user and list its chunks.

      A temporary user is registered and logged in on ``mutable_client``.
      Document deletion and logout both run in ``finally`` blocks, so a
      failed check neither leaves the document behind nor leaves the client
      logged in as the temporary user.
      """
      temp_user = f"{uuid.uuid4()}@me.com"
      mutable_client.users.register(temp_user, "password")
      mutable_client.users.login(temp_user, "password")

      try:
          resp = mutable_client.documents.create(
              chunks=["C1", "C2", "C3"], run_with_orchestration=False
          )["results"]
          doc_id = resp["document_id"]
          try:
              chunks_resp = mutable_client.documents.list_chunks(id=doc_id)
              results = chunks_resp["results"]
              assert len(results) == 3, "Expected 3 chunks"
          finally:
              mutable_client.documents.delete(id=doc_id)
      finally:
          mutable_client.users.logout()
  def test_search_documents_extended(client):
      """Ingest a known sentence and expect a basic search to find a hit.

      The document is deleted in a ``finally`` block so that a failed search
      assertion does not leave it behind on the server.
      """
      doc_id = client.documents.create(
          raw_text="Aristotle was a Greek philosopher.",
          run_with_orchestration=False,
      )["results"]["document_id"]

      try:
          time.sleep(1)  # If indexing is asynchronous
          search_results = client.documents.search(
              query="Greek philosopher",
              search_mode="basic",
              search_settings={"limit": 1},
          )
          assert "results" in search_results, "No results key in search response"
          assert len(search_results["results"]) > 0, "No documents found"
      finally:
          # Cleanup runs even when the search or an assertion fails.
          client.documents.delete(id=doc_id)
  def test_retrieve_document_not_found(client):
      """Retrieving a random, unknown document id must fail with 404."""
      unknown_id = str(uuid.uuid4())
      with pytest.raises(R2RException) as exc_info:
          client.documents.retrieve(id=unknown_id)
      raised = exc_info.value
      assert raised.status_code == 404, "Wrong error code for not found"
  def test_delete_document_non_existent(client):
      """Deleting a random, unknown document id must fail with 404."""
      unknown_id = str(uuid.uuid4())
      with pytest.raises(R2RException) as exc_info:
          client.documents.delete(id=unknown_id)
      status = exc_info.value.status_code
      assert status == 404, "Wrong error code for delete non-existent"
  194. # @pytest.mark.skip(reason="If your API restricts this endpoint to superusers")
  def test_get_document_collections_non_superuser(client):
      """A freshly registered user must get 403 when listing collections.

      A second client is built against the same base URL, a new user is
      registered and logged in on it, and collections are requested for a
      random document id.
      """
      # Create a non-superuser client
      non_super_client = R2RClient(client.base_url)
      email = f"{uuid.uuid4()}@me.com"
      non_super_client.users.register(email, "password")
      non_super_client.users.login(email, "password")

      document_id = str(uuid.uuid4())  # Some doc ID
      with pytest.raises(R2RException) as exc_info:
          non_super_client.documents.list_collections(id=document_id)
      status = exc_info.value.status_code
      assert status == 403, "Expected 403 for non-superuser collections access"
  def test_access_document_not_owned(client):
      """A different user must get 403 when downloading someone else's doc.

      The document is created with ``client``; a second client with a newly
      registered user then attempts the download. The document is deleted in
      a ``finally`` block so that a failed check does not leave it behind.
      """
      # Create a doc as superuser
      doc_id = client.documents.create(
          raw_text="Owner doc test", run_with_orchestration=False
      )["results"]["document_id"]

      try:
          # Now try to access with a non-superuser
          non_super_client = R2RClient(client.base_url)
          random_string = str(uuid.uuid4())
          non_super_client.users.register(f"{random_string}@me.com", "password")
          non_super_client.users.login(f"{random_string}@me.com", "password")

          with pytest.raises(R2RException) as exc_info:
              non_super_client.documents.download(id=doc_id)
          assert (
              exc_info.value.status_code == 403
          ), "Wrong error code for unauthorized access"
      finally:
          # Cleanup with the owning client, even when a step above fails.
          client.documents.delete(id=doc_id)
  def test_list_documents_with_pagination(mutable_client):
      """Create three documents as a fresh user and list with ``limit=2``.

      Every document created so far is deleted in a ``finally`` block, so a
      failure part-way through creation or in the assertion does not leave
      documents behind.
      """
      temp_user = f"{uuid.uuid4()}@me.com"
      mutable_client.users.register(temp_user, "password")
      mutable_client.users.login(temp_user, "password")

      doc_ids = []
      try:
          for i in range(3):
              resp = mutable_client.documents.create(
                  raw_text=f"Doc {i}", run_with_orchestration=False
              )["results"]
              doc_ids.append(resp["document_id"])

          listed = mutable_client.documents.list(limit=2, offset=0)
          results = listed["results"]
          assert len(results) == 2, "Expected 2 results for paginated listing"
      finally:
          # Cleanup
          for d in doc_ids:
              mutable_client.documents.delete(id=d)
  def test_ingest_invalid_chunks(client):
      """Creating a document from non-string chunks must be rejected.

      Either a 400 or a 422 status code is accepted.
      """
      # One valid string mixed with an int and a dict.
      chunks = ["Valid chunk", 12345, {"not": "a string"}]
      with pytest.raises(R2RException) as exc_info:
          client.documents.create(chunks=chunks, run_with_orchestration=False)
      accepted_codes = [400, 422]
      assert (
          exc_info.value.status_code in accepted_codes
      ), "Expected validation error for invalid chunks"
  def test_ingest_too_many_chunks(client):
      """Creating a document with too many chunks must fail with 400."""
      chunk_count = 1024 * 100 + 1  # Just over the limit
      excessive_chunks = ["Chunk"] * chunk_count
      with pytest.raises(R2RException) as exc_info:
          client.documents.create(
              chunks=excessive_chunks, run_with_orchestration=False
          )
      status = exc_info.value.status_code
      assert status == 400, "Wrong error code for exceeding max chunks"
  def test_delete_by_complex_filter(client):
      """Delete two differently tagged documents with a single ``$or`` filter.

      Both documents must return 404 afterwards. A best-effort direct delete
      in ``finally`` keeps them from lingering when the filter-based deletion
      itself fails.
      """
      doc1 = client.documents.create(
          raw_text="Doc with tag A",
          metadata={"tag": "A"},
          run_with_orchestration=False,
      )["results"]["document_id"]
      doc2 = client.documents.create(
          raw_text="Doc with tag B",
          metadata={"tag": "B"},
          run_with_orchestration=False,
      )["results"]["document_id"]

      try:
          filters = {"$or": [{"tag": {"$eq": "A"}}, {"tag": {"$eq": "B"}}]}
          del_resp = client.documents.delete_by_filter(filters)["results"]
          assert del_resp["success"], "Complex filter deletion failed"

          # Verify both documents are deleted
          for d_id in [doc1, doc2]:
              with pytest.raises(R2RException) as exc_info:
                  # Keyword form, matching every other retrieve call here.
                  client.documents.retrieve(id=d_id)
              assert (
                  exc_info.value.status_code == 404
              ), f"Document {d_id} still exists after deletion"
      finally:
          # Best-effort cleanup: on the success path both documents are
          # already gone, so an R2RException here is expected and ignored.
          for d_id in [doc1, doc2]:
              try:
                  client.documents.delete(id=d_id)
              except R2RException:
                  pass
  def test_search_documents_no_match(client):
      """A search filtered on a non-existent category must return no hits.

      The helper document is deleted in a ``finally`` block so that a failed
      assertion does not leave it behind on the server.
      """
      doc_id = client.documents.create(
          raw_text="Just a random document",
          metadata={"category": "unrelated"},
          run_with_orchestration=False,
      )["results"]["document_id"]

      try:
          # Search for non-existent category
          search_results = client.documents.search(
              query="nonexistent category",
              search_mode="basic",
              search_settings={
                  "filters": {"category": {"$eq": "doesnotexist"}},
                  "limit": 10,
              },
          )
          assert "results" in search_results, "Search missing results key"
          assert len(search_results["results"]) == 0, "Expected zero results"
      finally:
          # Cleanup runs even when the search or an assertion fails.
          client.documents.delete(id=doc_id)
  299. from datetime import datetime
  300. import pytest
  301. from r2r import R2RException
  def test_delete_by_workflow_metadata(client):
      """Test deletion by workflow state metadata.

      Three documents are created: two drafts (review counts 0 and 1) and one
      published document. The filter targets drafts with a review count of 0,
      so only the first document should disappear.

      Cleanup covers every created document, including the first one, so
      nothing lingers when the filter deletion itself fails.
      """
      # Random suffix appended to each raw text; converted to str once.
      random_suffix = str(uuid.uuid4())

      # Create test documents with workflow metadata
      docs = [
          client.documents.create(
              raw_text="Draft document 1" + random_suffix,
              metadata={
                  "workflow": {
                      "state": "draft",
                      "assignee": "user1",
                      "review_count": 0,
                  }
              },
              run_with_orchestration=False,
          )["results"]["document_id"],
          client.documents.create(
              raw_text="Draft document 2" + random_suffix,
              metadata={
                  "workflow": {
                      "state": "draft",
                      "assignee": "user2",
                      "review_count": 1,
                  }
              },
              run_with_orchestration=False,
          )["results"]["document_id"],
          client.documents.create(
              raw_text="Published document" + random_suffix,
              metadata={
                  "workflow": {
                      "state": "published",
                      "assignee": "user1",
                      "review_count": 2,
                  }
              },
              run_with_orchestration=False,
          )["results"]["document_id"],
      ]

      try:
          # Delete drafts with no reviews
          filters = {
              "$and": [
                  {"metadata.workflow.state": {"$eq": "draft"}},
                  {"metadata.workflow.review_count": {"$eq": 0}},
              ]
          }

          response = client.documents.delete_by_filter(filters)["results"]
          assert response["success"]

          # Verify first draft is deleted
          with pytest.raises(R2RException) as exc:
              client.documents.retrieve(id=docs[0])
          assert exc.value.status_code == 404

          # Verify other documents still exist
          assert client.documents.retrieve(id=docs[1])
          assert client.documents.retrieve(id=docs[2])

      finally:
          # Best-effort cleanup of every created document. The first one is
          # normally gone already, so an R2RException is expected and ignored.
          for doc_id in docs:
              try:
                  client.documents.delete(id=doc_id)
              except R2RException:
                  pass
  def test_delete_by_classification_metadata(client):
      """Test deletion by document classification metadata.

      Two documents are created: an HR document with 7 retention years and a
      Marketing document with 1. The filter targets HR documents with more
      than 5 retention years, so only the first should disappear.

      Cleanup covers both documents, so the HR document does not linger when
      the filter deletion itself fails.
      """
      # Create test documents with classification metadata
      docs = [
          client.documents.create(
              raw_text="Confidential document",
              metadata={
                  "classification": {
                      "level": "confidential",
                      "department": "HR",
                      "retention_years": 7,
                  }
              },
              run_with_orchestration=False,
          )["results"]["document_id"],
          client.documents.create(
              raw_text="Public document",
              metadata={
                  "classification": {
                      "level": "public",
                      "department": "Marketing",
                      "retention_years": 1,
                  }
              },
              run_with_orchestration=False,
          )["results"]["document_id"],
      ]

      try:
          # Delete HR documents with high retention
          filters = {
              "$and": [
                  {"classification.department": {"$eq": "HR"}},
                  {"classification.retention_years": {"$gt": 5}},
              ]
          }

          response = client.documents.delete_by_filter(filters)["results"]
          assert response["success"]

          # Verify confidential HR doc is deleted
          with pytest.raises(R2RException) as exc:
              client.documents.retrieve(id=docs[0])
          assert exc.value.status_code == 404

          # Verify public doc still exists
          assert client.documents.retrieve(id=docs[1])

      finally:
          # Best-effort cleanup of both documents. The first one is normally
          # gone already, so an R2RException is expected and ignored.
          for doc_id in docs:
              try:
                  client.documents.delete(id=doc_id)
              except R2RException:
                  pass
  def test_delete_by_version_metadata(client):
      """Test deletion by version and status metadata with array conditions.

      Two documents are created: a deprecated one tagged "legacy" and a
      current one tagged "stable". The filter targets deprecated documents
      whose tags match ``$in ["legacy"]``, so only the first should
      disappear.

      Cleanup covers both documents, so the deprecated document does not
      linger when the filter deletion itself fails.
      """
      # Random suffix appended to each raw text; converted to str once.
      suffix = str(uuid.uuid4())
      docs = [
          client.documents.create(
              raw_text="Old version document" + suffix,
              metadata={
                  "version_info": {
                      "number": "1.0.0",
                      "status": "deprecated",
                      "tags": ["legacy", "unsupported"],
                  },
              },
              run_with_orchestration=False,
          )["results"]["document_id"],
          client.documents.create(
              raw_text="Current version document" + suffix,
              metadata={
                  "version_info": {
                      "number": "2.0.0",
                      "status": "current",
                      "tags": ["stable", "supported"],
                  },
              },
              run_with_orchestration=False,
          )["results"]["document_id"],
      ]

      try:
          # Delete deprecated documents with legacy tag
          filters = {
              "$and": [
                  {"metadata.version_info.status": {"$eq": "deprecated"}},
                  # TODO - Why is `$in` not working for deletion?
                  {"metadata.version_info.tags": {"$in": ["legacy"]}},
              ]
          }

          response = client.documents.delete_by_filter(filters)["results"]
          assert response["success"]

          # Verify deprecated doc is deleted
          with pytest.raises(R2RException) as exc:
              client.documents.retrieve(id=docs[0])
          assert exc.value.status_code == 404

          # Verify current doc still exists
          assert client.documents.retrieve(id=docs[1])

      finally:
          # Best-effort cleanup of both documents. The first one is normally
          # gone already, so an R2RException is expected and ignored.
          for doc_id in docs:
              try:
                  client.documents.delete(id=doc_id)
              except R2RException:
                  pass