123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
- import uuid
- import pytest
- from r2r import R2RException
@pytest.fixture
def setup_docs_with_collections(client):
    """Create three collections and four documents with known memberships.

    Membership layout (indices refer to the yielded lists):

        doc_ids[0] -> coll_ids[0]
        doc_ids[1] -> coll_ids[0] and coll_ids[1]
        doc_ids[2] -> no collection added by this fixture
        doc_ids[3] -> coll_ids[2]

    Yields:
        A dict with "coll_ids" (three collection ids) and "doc_ids" (four
        document ids, ordered as in the layout above).

    Teardown deletes every document and collection that was created, and it
    also runs when setup fails partway so nothing is left behind.
    """
    random_suffix = str(uuid.uuid4())[:8]
    coll_ids = []
    created_docs = []

    def create_doc(text):
        # Record the id immediately so teardown can remove the document even
        # if a later setup step raises.
        doc_id = client.documents.create(
            raw_text=text + random_suffix, run_with_orchestration=False
        )["results"]["document_id"]
        created_docs.append(doc_id)
        return doc_id

    try:
        # Create some test collections
        for i in range(3):
            resp = client.collections.create(name=f"TestColl{i}")["results"]
            coll_ids.append(resp["id"])

        # doc1: [coll_ids[0]]
        doc1 = create_doc("Doc in coll1")
        client.collections.add_document(coll_ids[0], doc1)

        # doc2: [coll_ids[0], coll_ids[1]]
        doc2 = create_doc("Doc in coll1 and coll2")
        client.collections.add_document(coll_ids[0], doc2)
        client.collections.add_document(coll_ids[1], doc2)

        # doc3: no collections
        doc3 = create_doc("Doc in no collections")

        # doc4: [coll_ids[2]]
        doc4 = create_doc("Doc in coll3")
        client.collections.add_document(coll_ids[2], doc4)

        yield {"coll_ids": coll_ids, "doc_ids": [doc1, doc2, doc3, doc4]}
    finally:
        # Best-effort cleanup: a test may already have deleted some of these,
        # so an R2RException from delete is expected and ignored.
        for d_id in created_docs:
            try:
                client.documents.delete(id=d_id)
            except R2RException:
                pass
        for c_id in coll_ids:
            try:
                client.collections.delete(c_id)
            except R2RException:
                pass
def test_collection_id_eq_filter(client, setup_docs_with_collections):
    """`$eq` on collection_id returns exactly the docs in that collection."""
    fixture = setup_docs_with_collections
    target_collection = fixture["coll_ids"][0]
    doc1, doc2, _doc3, _doc4 = fixture["doc_ids"]

    # Only doc1 and doc2 were added to coll_ids[0].
    eq_filter = {"collection_id": {"$eq": target_collection}}
    response = client.retrieval.search(
        query="whoami", search_settings={"filters": eq_filter}
    )
    chunks = response["results"]["chunk_search_results"]

    found_ids = set()
    for chunk in chunks:
        found_ids.add(chunk["document_id"])

    expected = {doc1, doc2}
    assert expected == found_ids, f"Expected doc1 and doc2, got {found_ids}"
def test_collection_id_ne_filter(client, setup_docs_with_collections):
    """`$ne` on collection_id excludes the docs that are in that collection.

    Of the fixture's documents, doc1 and doc2 are in coll_ids[0]; doc3 (no
    collection) and doc4 (coll_ids[2] only) are not, so only those two
    should come back.
    """
    coll_ids = setup_docs_with_collections["coll_ids"]
    doc_ids = setup_docs_with_collections["doc_ids"]
    _doc1, _doc2, doc3, doc4 = doc_ids

    filters = {"collection_id": {"$ne": coll_ids[0]}}
    listed = client.retrieval.search(
        query="whoami", search_settings={"filters": filters}
    )["results"]["chunk_search_results"]

    # A negative filter may also match documents that exist outside this
    # fixture, so compare only against the documents created here.
    found_ids = {d["document_id"] for d in listed} & set(doc_ids)
    assert {
        doc3,
        doc4,
    } == found_ids, f"Expected doc3 and doc4, got {found_ids}"
def test_collection_id_in_filter(client, setup_docs_with_collections):
    """`$in` on collection_id returns docs in any of the listed collections."""
    fixture = setup_docs_with_collections
    collections = fixture["coll_ids"]
    doc1, doc2, _doc3, doc4 = fixture["doc_ids"]

    # doc1 and doc2 are in coll_ids[0], doc4 is in coll_ids[2];
    # doc3 belongs to neither.
    wanted = [collections[0], collections[2]]
    in_filter = {"collection_id": {"$in": wanted}}
    response = client.retrieval.search(
        query="whoami", search_settings={"filters": in_filter}
    )
    chunks = response["results"]["chunk_search_results"]

    found_ids = set()
    for chunk in chunks:
        found_ids.add(chunk["document_id"])

    expected = {doc1, doc2, doc4}
    assert expected == found_ids, f"Expected doc1, doc2, doc4, got {found_ids}"
def test_collection_id_nin_filter(client, setup_docs_with_collections):
    """`$nin` on collection_id excludes docs in any of the listed collections.

    Of the fixture's documents only doc2 belongs to coll_ids[1], so doc1,
    doc3 and doc4 should remain.
    """
    coll_ids = setup_docs_with_collections["coll_ids"]
    doc_ids = setup_docs_with_collections["doc_ids"]
    doc1, _doc2, doc3, doc4 = doc_ids

    filters = {"collection_id": {"$nin": [coll_ids[1]]}}
    listed = client.retrieval.search(
        query="whoami", search_settings={"filters": filters}
    )["results"]["chunk_search_results"]

    # A negative filter may also match documents that exist outside this
    # fixture, so compare only against the documents created here.
    found_ids = {d["document_id"] for d in listed} & set(doc_ids)
    assert {
        doc1,
        doc3,
        doc4,
    } == found_ids, f"Expected doc1, doc3, doc4, got {found_ids}"
def test_collection_id_contains_filter(client, setup_docs_with_collections):
    """`$contains` with a single id matches docs whose collections include it."""
    fixture = setup_docs_with_collections
    target_collection = fixture["coll_ids"][0]
    doc1, doc2, _doc3, _doc4 = fixture["doc_ids"]

    # With a single UUID, the doc's collection array must contain that UUID;
    # for coll_ids[0] that is doc1 and doc2 only.
    contains_filter = {"collection_id": {"$contains": target_collection}}
    response = client.retrieval.search(
        query="whoami", search_settings={"filters": contains_filter}
    )
    chunks = response["results"]["chunk_search_results"]

    found_ids = set()
    for chunk in chunks:
        found_ids.add(chunk["document_id"])

    expected = {doc1, doc2}
    assert expected == found_ids, f"Expected doc1 and doc2, got {found_ids}"
def test_collection_id_contains_multiple(client, setup_docs_with_collections):
    """`$contains` with a list requires the doc to be in ALL listed collections."""
    fixture = setup_docs_with_collections
    collections = fixture["coll_ids"]
    _doc1, doc2, _doc3, _doc4 = fixture["doc_ids"]

    # Only doc2 is in both coll_ids[0] and coll_ids[1]. doc1 has only
    # coll_ids[0], doc3 has no collections, doc4 has only coll_ids[2].
    required = [collections[0], collections[1]]
    contains_filter = {"collection_id": {"$contains": required}}
    response = client.retrieval.search(
        query="whoami", search_settings={"filters": contains_filter}
    )
    chunks = response["results"]["chunk_search_results"]

    found_ids = set()
    for chunk in chunks:
        found_ids.add(chunk["document_id"])

    expected = {doc2}
    assert expected == found_ids, f"Expected doc2 only, got {found_ids}"
def test_delete_by_collection_id_eq(client, setup_docs_with_collections):
    """Deleting by a collection_id `$eq` filter removes only that collection's docs."""
    fixture = setup_docs_with_collections
    target_collection = fixture["coll_ids"][0]
    doc1, doc2, doc3, doc4 = fixture["doc_ids"]

    # Delete every document that belongs to coll_ids[0].
    eq_filter = {"collection_id": {"$eq": target_collection}}
    outcome = client.documents.delete_by_filter(eq_filter)["results"]
    assert outcome["success"], "Failed to delete by collection_id $eq filter"

    # doc1 and doc2 were in coll_ids[0]: retrieving them must now raise a 404.
    for d_id in (doc1, doc2):
        with pytest.raises(R2RException) as exc:
            client.documents.retrieve(d_id)
        assert exc.value.status_code == 404, f"Doc {d_id} still exists!"

    # doc3 and doc4 were not in coll_ids[0] and must still be retrievable.
    for kept_id in (doc3, doc4):
        assert client.documents.retrieve(kept_id)
|