test_filters.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. import uuid
  2. import pytest
  3. from r2r import R2RException
  4. @pytest.fixture
  5. def setup_docs_with_collections(client):
  6. # Create some test collections
  7. random_suffix = str(uuid.uuid4())[:8]
  8. coll_ids = []
  9. for i in range(3):
  10. resp = client.collections.create(name=f"TestColl{i}")["results"]
  11. coll_ids.append(resp["id"])
  12. # Create documents with different collection arrangements:
  13. # doc1: [coll1]
  14. doc1 = client.documents.create(
  15. raw_text="Doc in coll1" + random_suffix, run_with_orchestration=False
  16. )["results"]["document_id"]
  17. client.collections.add_document(coll_ids[0], doc1)
  18. # doc2: [coll1, coll2]
  19. doc2 = client.documents.create(
  20. raw_text="Doc in coll1 and coll2" + random_suffix,
  21. run_with_orchestration=False,
  22. )["results"]["document_id"]
  23. client.collections.add_document(coll_ids[0], doc2)
  24. client.collections.add_document(coll_ids[1], doc2)
  25. # doc3: no collections
  26. doc3 = client.documents.create(
  27. raw_text="Doc in no collections" + random_suffix,
  28. run_with_orchestration=False,
  29. )["results"]["document_id"]
  30. # doc4: [coll3]
  31. doc4 = client.documents.create(
  32. raw_text="Doc in coll3" + random_suffix, run_with_orchestration=False
  33. )["results"]["document_id"]
  34. client.collections.add_document(coll_ids[2], doc4)
  35. yield {"coll_ids": coll_ids, "doc_ids": [doc1, doc2, doc3, doc4]}
  36. # Cleanup
  37. for d_id in [doc1, doc2, doc3, doc4]:
  38. try:
  39. client.documents.delete(id=d_id)
  40. except R2RException:
  41. pass
  42. for c_id in coll_ids:
  43. try:
  44. client.collections.delete(c_id)
  45. except R2RException:
  46. pass
  47. def test_collection_id_eq_filter(client, setup_docs_with_collections):
  48. coll_ids = setup_docs_with_collections["coll_ids"]
  49. doc_ids = setup_docs_with_collections["doc_ids"]
  50. doc1, doc2, doc3, doc4 = doc_ids
  51. # collection_id = coll_ids[0] should match doc1 and doc2 only
  52. filters = {"collection_id": {"$eq": coll_ids[0]}}
  53. listed = client.retrieval.search(
  54. query="whoami", search_settings={"filters": filters}
  55. )["results"]["chunk_search_results"]
  56. found_ids = {d["document_id"] for d in listed}
  57. assert {
  58. doc1,
  59. doc2,
  60. } == found_ids, f"Expected doc1 and doc2, got {found_ids}"
  61. def test_collection_id_ne_filter(client, setup_docs_with_collections):
  62. coll_ids = setup_docs_with_collections["coll_ids"]
  63. doc_ids = setup_docs_with_collections["doc_ids"]
  64. doc1, doc2, doc3, doc4 = doc_ids
  65. # collection_id != coll_ids[0] means docs that are NOT in coll0
  66. # Those are doc3 (no collections) and doc4 (in coll3 only)
  67. filters = {"collection_id": {"$ne": coll_ids[0]}}
  68. # listed = client.documents.list(limit=10, offset=0, filters=filters)["results"]
  69. listed = client.retrieval.search(
  70. query="whoami", search_settings={"filters": filters}
  71. )["results"]["chunk_search_results"]
  72. found_ids = {d["document_id"] for d in listed}
  73. assert {
  74. doc3,
  75. doc4,
  76. } == found_ids, f"Expected doc3 and doc4, got {found_ids}"
  77. def test_collection_id_in_filter(client, setup_docs_with_collections):
  78. coll_ids = setup_docs_with_collections["coll_ids"]
  79. doc_ids = setup_docs_with_collections["doc_ids"]
  80. doc1, doc2, doc3, doc4 = doc_ids
  81. # collection_id in [coll_ids[0], coll_ids[2]] means docs in either coll0 or coll2
  82. # doc1 in coll0, doc2 in coll0, doc4 in coll2
  83. # doc3 is in none
  84. filters = {"collection_id": {"$in": [coll_ids[0], coll_ids[2]]}}
  85. listed = client.retrieval.search(
  86. query="whoami", search_settings={"filters": filters}
  87. )["results"]["chunk_search_results"]
  88. found_ids = {d["document_id"] for d in listed}
  89. assert {
  90. doc1,
  91. doc2,
  92. doc4,
  93. } == found_ids, f"Expected doc1, doc2, doc4, got {found_ids}"
  94. def test_collection_id_nin_filter(client, setup_docs_with_collections):
  95. coll_ids = setup_docs_with_collections["coll_ids"]
  96. doc_ids = setup_docs_with_collections["doc_ids"]
  97. doc1, doc2, doc3, doc4 = doc_ids
  98. # collection_id nin [coll_ids[1]] means docs that do NOT belong to coll1
  99. # doc2 belongs to coll1, so exclude doc2
  100. # doc1, doc3, doc4 remain
  101. filters = {"collection_id": {"$nin": [coll_ids[1]]}}
  102. listed = client.retrieval.search(
  103. query="whoami", search_settings={"filters": filters}
  104. )["results"]["chunk_search_results"]
  105. found_ids = {d["document_id"] for d in listed}
  106. assert {
  107. doc1,
  108. doc3,
  109. doc4,
  110. } == found_ids, f"Expected doc1, doc3, doc4, got {found_ids}"
  111. def test_collection_id_contains_filter(client, setup_docs_with_collections):
  112. coll_ids = setup_docs_with_collections["coll_ids"]
  113. doc_ids = setup_docs_with_collections["doc_ids"]
  114. doc1, doc2, doc3, doc4 = doc_ids
  115. # $contains: For a single collection_id, we interpret as arrays that must contain the given UUID.
  116. # If collection_id {"$contains": "coll_ids[0]"}, docs must have coll0 in their array
  117. # That would be doc1 and doc2 only
  118. filters = {"collection_id": {"$contains": coll_ids[0]}}
  119. listed = client.retrieval.search(
  120. query="whoami", search_settings={"filters": filters}
  121. )["results"]["chunk_search_results"]
  122. found_ids = {d["document_id"] for d in listed}
  123. assert {
  124. doc1,
  125. doc2,
  126. } == found_ids, f"Expected doc1 and doc2, got {found_ids}"
  127. def test_collection_id_contains_multiple(client, setup_docs_with_collections):
  128. coll_ids = setup_docs_with_collections["coll_ids"]
  129. doc_ids = setup_docs_with_collections["doc_ids"]
  130. doc1, doc2, doc3, doc4 = doc_ids
  131. # If we allow $contains with a list, e.g., {"$contains": [coll_ids[0], coll_ids[1]]},
  132. # this should mean the doc's collection_ids contain ALL of these.
  133. # Only doc2 has coll0 AND coll1. doc1 only has coll0, doc3 no collections, doc4 only coll3.
  134. filters = {"collection_id": {"$contains": [coll_ids[0], coll_ids[1]]}}
  135. listed = client.retrieval.search(
  136. query="whoami", search_settings={"filters": filters}
  137. )["results"]["chunk_search_results"]
  138. found_ids = {d["document_id"] for d in listed}
  139. assert {doc2} == found_ids, f"Expected doc2 only, got {found_ids}"
  140. def test_delete_by_collection_id_eq(client, setup_docs_with_collections):
  141. coll_ids = setup_docs_with_collections["coll_ids"]
  142. doc1, doc2, doc3, doc4 = setup_docs_with_collections["doc_ids"]
  143. # Delete documents in coll0
  144. filters = {"collection_id": {"$eq": coll_ids[0]}}
  145. del_resp = client.documents.delete_by_filter(filters)["results"]
  146. assert del_resp["success"], "Failed to delete by collection_id $eq filter"
  147. # doc1 and doc2 should be deleted, doc3 and doc4 remain
  148. for d_id in [doc1, doc2]:
  149. with pytest.raises(R2RException) as exc:
  150. client.documents.retrieve(d_id)
  151. assert exc.value.status_code == 404, f"Doc {d_id} still exists!"
  152. # Check doc3 and doc4 still exist
  153. assert client.documents.retrieve(doc3)
  154. assert client.documents.retrieve(doc4)