test_documents.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386
  1. import time
  2. import uuid
  3. import pytest
  4. from r2r import R2RClient, R2RException
  5. @pytest.fixture(scope="session")
  6. def config():
  7. class TestConfig:
  8. base_url = "http://localhost:7272"
  9. superuser_email = "admin@example.com"
  10. superuser_password = "change_me_immediately"
  11. return TestConfig()
  12. @pytest.fixture(scope="session")
  13. def client(config):
  14. """Create a client instance and log in as a superuser."""
  15. client = R2RClient(config.base_url)
  16. client.users.login(config.superuser_email, config.superuser_password)
  17. return client
  18. @pytest.fixture
  19. def test_document(client):
  20. """Create and yield a test document, then clean up."""
  21. resp = client.documents.create(
  22. raw_text="Temporary doc", run_with_orchestration=False
  23. )
  24. document_id = resp["results"]["document_id"]
  25. yield document_id
  26. # Cleanup
  27. try:
  28. client.documents.delete(id=document_id)
  29. except R2RException:
  30. pass
  31. def test_create_document_with_file(client):
  32. resp = client.documents.create(
  33. file_path="core/examples/data/aristotle.txt",
  34. run_with_orchestration=False,
  35. )["results"]
  36. assert (
  37. "document_id" in resp and resp["document_id"]
  38. ), "No document_id returned after file ingestion"
  39. # Cleanup
  40. client.documents.delete(id=resp["document_id"])
  41. def test_create_document_with_raw_text(client):
  42. resp = client.documents.create(
  43. raw_text="This is raw text content.", run_with_orchestration=False
  44. )["results"]
  45. doc_id = resp["document_id"]
  46. assert doc_id, "No document_id returned after raw text ingestion"
  47. # Verify retrieval
  48. retrieved = client.documents.retrieve(id=doc_id)["results"]
  49. assert (
  50. retrieved["id"] == doc_id
  51. ), "Failed to retrieve the ingested raw text document"
  52. # Cleanup
  53. client.documents.delete(id=doc_id)
  54. def test_create_document_with_chunks(client):
  55. resp = client.documents.create(
  56. chunks=["Chunk one", "Chunk two"], run_with_orchestration=False
  57. )["results"]
  58. doc_id = resp["document_id"]
  59. assert doc_id, "No document_id returned after chunk ingestion"
  60. retrieved = client.documents.retrieve(id=doc_id)["results"]
  61. assert (
  62. retrieved["id"] == doc_id
  63. ), "Failed to retrieve the chunk-based document"
  64. # Cleanup
  65. client.documents.delete(id=doc_id)
  66. def test_create_document_different_modes(client):
  67. # hi-res mode
  68. hi_res_resp = client.documents.create(
  69. raw_text="High resolution doc.",
  70. ingestion_mode="hi-res",
  71. run_with_orchestration=False,
  72. )["results"]
  73. hi_res_id = hi_res_resp["document_id"]
  74. assert hi_res_id, "No doc_id returned for hi-res ingestion"
  75. client.documents.delete(id=hi_res_id)
  76. # fast mode
  77. fast_resp = client.documents.create(
  78. raw_text="Fast mode doc.",
  79. ingestion_mode="fast",
  80. run_with_orchestration=False,
  81. )["results"]
  82. fast_id = fast_resp["document_id"]
  83. assert fast_id, "No doc_id returned for fast ingestion"
  84. client.documents.delete(id=fast_id)
  85. def test_list_documents(client, test_document):
  86. listed = client.documents.list(offset=0, limit=10)
  87. results = listed["results"]
  88. assert isinstance(results, list), "Documents list response is not a list"
  89. assert len(results) >= 1, "Expected at least one document"
  90. # test_document is created for this test, so we expect at least that one present.
  91. def test_retrieve_document(client, test_document):
  92. retrieved = client.documents.retrieve(id=test_document)["results"]
  93. assert retrieved["id"] == test_document, "Retrieved wrong document"
  94. def test_download_document(client, test_document):
  95. # For text-only documents, the endpoint returns text as a buffer
  96. content = client.documents.download(id=test_document)
  97. assert content, "Failed to download document content"
  98. data = content.getvalue()
  99. assert len(data) > 0, "Document content is empty"
  100. def test_delete_document(client):
  101. # Create a doc to delete
  102. resp = client.documents.create(
  103. raw_text="This is a temporary doc", run_with_orchestration=False
  104. )["results"]
  105. doc_id = resp["document_id"]
  106. del_resp = client.documents.delete(id=doc_id)["results"]
  107. assert del_resp["success"], "Failed to delete document"
  108. # Verify it's gone
  109. with pytest.raises(R2RException) as exc_info:
  110. client.documents.retrieve(id=doc_id)
  111. assert exc_info.value.status_code == 404, "Expected 404 after deletion"
  112. def test_delete_document_by_filter(client):
  113. # Create a doc with unique metadata
  114. resp = client.documents.create(
  115. raw_text="Document to be filtered out",
  116. metadata={"to_delete": "yes"},
  117. run_with_orchestration=False,
  118. )["results"]
  119. doc_id = resp["document_id"]
  120. filters = {"to_delete": {"$eq": "yes"}}
  121. del_resp = client.documents.delete_by_filter(filters)["results"]
  122. assert del_resp["success"], "Failed to delete documents by filter"
  123. # Verify deletion
  124. with pytest.raises(R2RException) as exc_info:
  125. client.documents.retrieve(id=doc_id)
  126. assert (
  127. exc_info.value.status_code == 404
  128. ), "Document still exists after filter-based deletion"
  129. # @pytest.mark.skip(reason="Only if superuser-specific logic is implemented")
  130. def test_list_document_collections(client, test_document):
  131. # This test assumes the currently logged in user is a superuser
  132. collections = client.documents.list_collections(id=test_document)[
  133. "results"
  134. ]
  135. assert isinstance(
  136. collections, list
  137. ), "Document collections list is not a list"
  138. # @pytest.mark.skip(
  139. # reason="Requires actual entity extraction logic implemented and superuser access"
  140. # )
  141. def test_extract_document(client, test_document):
  142. run_resp = client.documents.extract(
  143. id=test_document, run_type="run", run_with_orchestration=False
  144. )["results"]
  145. assert "message" in run_resp, "No message after extraction run"
  146. # @pytest.mark.skip(reason="Requires entity extraction results present")
  147. def test_list_entities(client, test_document):
  148. # If no entities extracted yet, this could raise an exception
  149. try:
  150. entities = client.documents.list_entities(id=test_document)["results"]
  151. assert isinstance(entities, list), "Entities response not a list"
  152. except R2RException as e:
  153. # Possibly no entities extracted yet
  154. pytest.skip(f"No entities extracted yet: {str(e)}")
  155. # @pytest.mark.skip(reason="Requires relationship extraction results present")
  156. def test_list_relationships(client, test_document):
  157. try:
  158. relationships = client.documents.list_relationships(id=test_document)[
  159. "results"
  160. ]
  161. assert isinstance(
  162. relationships, list
  163. ), "Relationships response not a list"
  164. except R2RException as e:
  165. pytest.skip(f"No relationships extracted yet: {str(e)}")
  166. def test_search_documents(client, test_document):
  167. # Add some delay if indexing takes time
  168. time.sleep(1)
  169. query = "Temporary"
  170. search_results = client.documents.search(
  171. query=query, search_mode="custom", search_settings={"limit": 5}
  172. )
  173. assert "results" in search_results, "Search results key not found"
  174. # We cannot guarantee a match, but at least we got a well-formed response
  175. assert isinstance(
  176. search_results["results"], list
  177. ), "Search results not a list"
  178. def test_list_document_chunks(client):
  179. resp = client.documents.create(
  180. chunks=["C1", "C2", "C3"], run_with_orchestration=False
  181. )["results"]
  182. doc_id = resp["document_id"]
  183. chunks_resp = client.documents.list_chunks(id=doc_id)
  184. results = chunks_resp["results"]
  185. assert len(results) == 3, "Expected 3 chunks"
  186. client.documents.delete(id=doc_id)
  187. def test_search_documents_extended(client):
  188. doc_id = client.documents.create(
  189. raw_text="Aristotle was a Greek philosopher.",
  190. run_with_orchestration=False,
  191. )["results"]["document_id"]
  192. time.sleep(1) # If indexing is asynchronous
  193. search_results = client.documents.search(
  194. query="Greek philosopher",
  195. search_mode="basic",
  196. search_settings={"limit": 1},
  197. )
  198. assert "results" in search_results, "No results key in search response"
  199. assert len(search_results["results"]) > 0, "No documents found"
  200. client.documents.delete(id=doc_id)
  201. def test_retrieve_document_not_found(client):
  202. bad_id = str(uuid.uuid4())
  203. with pytest.raises(R2RException) as exc_info:
  204. client.documents.retrieve(id=bad_id)
  205. assert exc_info.value.status_code == 404, "Wrong error code for not found"
  206. def test_delete_document_non_existent(client):
  207. bad_id = str(uuid.uuid4())
  208. with pytest.raises(R2RException) as exc_info:
  209. client.documents.delete(id=bad_id)
  210. assert (
  211. exc_info.value.status_code == 404
  212. ), "Wrong error code for delete non-existent"
  213. # @pytest.mark.skip(reason="If your API restricts this endpoint to superusers")
  214. def test_get_document_collections_non_superuser(client):
  215. # Create a non-superuser client
  216. non_super_client = R2RClient(client.base_url)
  217. random_string = str(uuid.uuid4())
  218. non_super_client.users.register(f"{random_string}@me.com", "password")
  219. non_super_client.users.login(f"{random_string}@me.com", "password")
  220. document_id = str(uuid.uuid4()) # Some doc ID
  221. with pytest.raises(R2RException) as exc_info:
  222. non_super_client.documents.list_collections(id=document_id)
  223. assert (
  224. exc_info.value.status_code == 403
  225. ), "Expected 403 for non-superuser collections access"
  226. def test_access_document_not_owned(client):
  227. # Create a doc as superuser
  228. doc_id = client.documents.create(
  229. raw_text="Owner doc test", run_with_orchestration=False
  230. )["results"]["document_id"]
  231. # Now try to access with a non-superuser
  232. non_super_client = R2RClient(client.base_url)
  233. random_string = str(uuid.uuid4())
  234. non_super_client.users.register(f"{random_string}@me.com", "password")
  235. non_super_client.users.login(f"{random_string}@me.com", "password")
  236. with pytest.raises(R2RException) as exc_info:
  237. non_super_client.documents.download(id=doc_id)
  238. assert (
  239. exc_info.value.status_code == 403
  240. ), "Wrong error code for unauthorized access"
  241. # Cleanup
  242. client.documents.delete(id=doc_id)
  243. def test_list_documents_with_pagination(client):
  244. doc_ids = []
  245. for i in range(5):
  246. resp = client.documents.create(
  247. raw_text=f"Doc {i}", run_with_orchestration=False
  248. )["results"]
  249. doc_ids.append(resp["document_id"])
  250. listed = client.documents.list(limit=2, offset=0)
  251. results = listed["results"]
  252. assert len(results) == 2, "Expected 2 results for paginated listing"
  253. # Cleanup
  254. for d in doc_ids:
  255. client.documents.delete(id=d)
  256. def test_ingest_invalid_chunks(client):
  257. invalid_chunks = ["Valid chunk", 12345, {"not": "a string"}]
  258. with pytest.raises(R2RException) as exc_info:
  259. client.documents.create(
  260. chunks=invalid_chunks, run_with_orchestration=False
  261. )
  262. assert exc_info.value.status_code in [
  263. 400,
  264. 422,
  265. ], "Expected validation error for invalid chunks"
  266. def test_ingest_too_many_chunks(client):
  267. excessive_chunks = ["Chunk"] * (1024 * 100 + 1) # Just over the limit
  268. with pytest.raises(R2RException) as exc_info:
  269. client.documents.create(
  270. chunks=excessive_chunks, run_with_orchestration=False
  271. )
  272. assert (
  273. exc_info.value.status_code == 400
  274. ), "Wrong error code for exceeding max chunks"
  275. def test_delete_by_complex_filter(client):
  276. doc1 = client.documents.create(
  277. raw_text="Doc with tag A",
  278. metadata={"tag": "A"},
  279. run_with_orchestration=False,
  280. )["results"]["document_id"]
  281. doc2 = client.documents.create(
  282. raw_text="Doc with tag B",
  283. metadata={"tag": "B"},
  284. run_with_orchestration=False,
  285. )["results"]["document_id"]
  286. filters = {"$or": [{"tag": {"$eq": "A"}}, {"tag": {"$eq": "B"}}]}
  287. del_resp = client.documents.delete_by_filter(filters)["results"]
  288. assert del_resp["success"], "Complex filter deletion failed"
  289. # Verify both documents are deleted
  290. for d_id in [doc1, doc2]:
  291. with pytest.raises(R2RException) as exc_info:
  292. client.documents.retrieve(d_id)
  293. assert (
  294. exc_info.value.status_code == 404
  295. ), f"Document {d_id} still exists after deletion"
  296. def test_search_documents_no_match(client):
  297. doc_id = client.documents.create(
  298. raw_text="Just a random document",
  299. metadata={"category": "unrelated"},
  300. run_with_orchestration=False,
  301. )["results"]["document_id"]
  302. # Search for non-existent category
  303. search_results = client.documents.search(
  304. query="nonexistent category",
  305. search_mode="basic",
  306. search_settings={
  307. "filters": {"category": {"$eq": "doesnotexist"}},
  308. "limit": 10,
  309. },
  310. )
  311. assert "results" in search_results, "Search missing results key"
  312. assert len(search_results["results"]) == 0, "Expected zero results"
  313. # Cleanup
  314. client.documents.delete(id=doc_id)