test_retrieval.py 27 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892
  1. import uuid
  2. import pytest
  3. from core.base import Message, SearchMode
  4. from r2r import R2RClient, R2RException
  @pytest.fixture(scope="session")
  def config():
      """Provide the connection settings shared by the whole test session.

      Returns an object exposing ``base_url``, ``superuser_email`` and
      ``superuser_password`` as attributes.
      """

      class TestConfig:
          # URL handed to ``R2RClient`` by the ``client`` fixture.
          base_url = "http://localhost:7272"
          # Credentials the ``client`` fixture logs in with.
          superuser_email = "admin@example.com"
          superuser_password = "change_me_immediately"

      settings = TestConfig()
      return settings
  @pytest.fixture(scope="session")
  def client(config):
      """Build one session-wide R2R client that is logged in as the superuser."""
      superuser_client = R2RClient(config.base_url)
      superuser_client.users.login(
          config.superuser_email,
          config.superuser_password,
      )
      return superuser_client
  def test_search_basic_mode(client: R2RClient):
      """A basic-mode search response must carry a ``results`` payload."""
      response = client.retrieval.search(query="Aristotle", search_mode="basic")
      payload = response.results
      assert payload is not None, "No results field in search response"
  def test_search_advanced_mode_with_filters(client: R2RClient):
      """Advanced-mode search accepts a metadata filter together with a limit."""
      settings = {
          "filters": {"metadata.document_type": {"$eq": "txt"}},
          "limit": 5,
      }
      response = client.retrieval.search(
          query="Philosophy",
          search_mode="advanced",
          search_settings=settings,
      )
      assert response.results is not None, "No results in advanced mode search"
  def test_search_custom_mode(client: R2RClient):
      """Custom-mode search with explicit settings must return a payload."""
      settings = {"use_semantic_search": True, "limit": 3}
      response = client.retrieval.search(
          query="Greek philosophers",
          search_mode="custom",
          search_settings=settings,
      )
      assert response.results is not None, "No results in custom mode search"
  def test_rag_query(client: R2RClient):
      """A non-streaming RAG request must produce a completion."""
      generation_config = {"stream": False, "max_tokens": 100}
      settings = {"use_semantic_search": True, "limit": 3}
      response = client.retrieval.rag(
          query="Summarize Aristotle's contributions to logic",
          rag_generation_config=generation_config,
          search_settings=settings,
      )
      answer = response.results
      assert answer.completion is not None, "RAG response missing 'completion'"
  def test_rag_with_filter(client: R2RClient):
      """RAG restricted by a metadata filter must still produce a completion."""
      # Create a document tagged tier='test'; the random suffix keeps its
      # text distinct from documents created by earlier runs.
      suffix = str(uuid.uuid4())
      document_text = (
          f"Aristotle was a Greek philosopher, contributions to philosophy were in logic, {suffix}."
      )
      client.documents.create(raw_text=document_text, metadata={"tier": "test"})

      settings = {
          "filters": {"metadata.tier": {"$eq": "test"}},
          "use_semantic_search": True,
          "limit": 3,
      }
      response = client.retrieval.rag(
          query="What were aristotle's contributions to philosophy?",
          rag_generation_config={"stream": False, "max_tokens": 100},
          search_settings=settings,
      )
      answer = response.results
      assert answer.completion is not None, "RAG response missing 'completion'"
  def test_rag_stream_query(client: R2RClient):
      """A streaming RAG request must yield at least one chunk."""
      stream = client.retrieval.rag(
          query="Detail the philosophical schools Aristotle influenced",
          rag_generation_config={"stream": True, "max_tokens": 50},
          search_settings={"use_semantic_search": True, "limit": 2},
      )

      # The response is iterated synchronously. Stop after the second chunk:
      # that is enough to show the stream is live without draining it.
      received = 0
      for _ in stream:
          received += 1
          if received > 1:
              break

      assert received > 0, "No chunks received from streamed RAG query"
  def test_agent_query(client: R2RClient):
      """A non-streaming agent call must return at least one message."""
      question = Message(role="user", content="What is Aristotle known for?")
      response = client.retrieval.agent(
          message=question,
          rag_generation_config={"stream": False, "max_tokens": 100},
          search_settings={"use_semantic_search": True, "limit": 3},
      )
      reply = response.results
      assert reply is not None, "Agent response missing 'results'"
      assert len(reply.messages) > 0, "No messages returned by agent"
  def test_agent_query_stream(client: R2RClient):
      """A streaming agent call must yield at least one chunk."""
      question = Message(role="user", content="Explain Aristotle's logic in steps.")
      stream = client.retrieval.agent(
          message=question,
          rag_generation_config={"stream": True, "max_tokens": 50},
          search_settings={"use_semantic_search": True, "limit": 3},
      )

      # Iterate synchronously and stop after the second chunk; the test only
      # needs proof that something was streamed.
      received = 0
      for _ in stream:
          received += 1
          if received > 1:
              break

      assert received > 0, "No streaming chunks received from agent"
  def test_completion(client: R2RClient):
      """A multi-turn chat completion must return results with choices."""
      # (role, content) pairs in conversation order.
      turns = [
          ("system", "You are a helpful assistant."),
          ("user", "What is the capital of France?"),
          ("assistant", "The capital of France is Paris."),
          ("user", "What about Italy?"),
      ]
      messages = [{"role": role, "content": content} for role, content in turns]

      response = client.retrieval.completion(
          messages,
          generation_config={"max_tokens": 50, "model": "openai/gpt-4.1"},
      )
      assert response.results is not None, "Completion response missing 'results'"
      assert response.results.choices is not None, "No choices in completion result"
  def test_embedding(client: R2RClient):
      """Embedding a short text must return a non-empty result."""
      vector = client.retrieval.embedding(text="Who is Aristotle?").results
      assert len(vector) > 0, "No embedding vector returned"
  def test_error_handling(client: R2RClient):
      """Searching without a query must raise ``R2RException`` with 400 or 422."""
      accepted_statuses = [400, 422]
      with pytest.raises(R2RException) as exc_info:
          client.retrieval.search(query=None)  # type: ignore

      # Inspect the captured exception only after the ``with`` block exits.
      status = exc_info.value.status_code
      assert status in accepted_statuses, "Expected validation error for missing query"
  def test_no_results_scenario(client: R2RClient):
      """A full-text-only search for a nonsense term must match no chunks."""
      settings = {
          "limit": 5,
          "use_semantic_search": False,
          "use_fulltext_search": True,
      }
      response = client.retrieval.search(
          query="aslkfjaldfjal",
          search_mode="custom",
          search_settings=settings,
      )
      hits = response.results.chunk_search_results
      assert len(hits) == 0, "Expected no results for nonsense query"
  def test_pagination_limit_one(client: R2RClient):
      """With ``limit=1`` a search must return exactly one chunk."""
      # Ingest three chunks, each made unique by its own random UUID.
      chunks = [f"{label} {uuid.uuid4()}" for label in ("a", "b", "c")]
      client.documents.create(chunks=chunks)

      response = client.retrieval.search(
          query="Aristotle",
          search_mode="basic",
          search_settings={"limit": 1},
      )
      hits = response.results.chunk_search_results
      assert len(hits) == 1, "Expected one result with limit=1"
  def test_pagination_offset(client: R2RClient):
      """Consecutive offsets must surface different chunks for the same query.

      Each page is checked for being non-empty before its first element is
      read, so a corpus with fewer than two matching chunks produces a
      readable assertion failure instead of an ``IndexError``.
      """

      def fetch_page(offset):
          # One-result page of the basic "Aristotle" search at ``offset``.
          return client.retrieval.search(
              query="Aristotle",
              search_mode="basic",
              search_settings={"limit": 1, "offset": offset},
          ).results.chunk_search_results

      first_page = fetch_page(0)
      second_page = fetch_page(1)

      assert first_page, "Expected a result at offset 0"
      assert second_page, "Expected a result at offset 1"
      assert (
          first_page[0].text != second_page[0].text
      ), "Offset should return different results"
  def test_rag_task_prompt(client: R2RClient):
      """A custom ``task_prompt`` must be reflected in the generated answer.

      The prompt instructs the model to end with a sentinel, and the test
      looks for that sentinel in the completion.
      """
      # NOTE(review): the prompt's original line breaks and indentation could
      # not be recovered exactly; it is assembled explicitly from its visible
      # lines - confirm against the upstream file.
      custom_prompt = (
          "\n"
          "Answer the query given immediately below given the context. End your answer with: [END-TEST-PROMPT]\n"
          "### Query:\n"
          "{query}\n"
          "### Context:\n"
          "{context}\n"
      )
      response = client.retrieval.rag(
          query="Tell me about Aristotle",
          # No max_tokens is set for this request.
          rag_generation_config={"stream": False},
          search_settings={"use_semantic_search": True, "limit": 3},
          task_prompt=custom_prompt,
      )
      answer = response.results.completion
      assert (
          "[END-TEST-PROMPT]" in answer
      ), "Custom prompt override not reflected in RAG answer"
  def test_agent_conversation_id(client: R2RClient):
      """Two agent turns sharing one conversation id must both return messages."""
      conversation_id = str(client.conversations.create().results.id)

      # (user prompt, assertion message) for each turn, in order.
      turns = [
          (
              "What is Aristotle known for?",
              "No results from agent with conversation_id",
          ),
          (
              "Can you elaborate more?",
              "No results from agent in second turn of conversation",
          ),
      ]
      for prompt, failure_message in turns:
          reply = client.retrieval.agent(
              message=Message(role="user", content=prompt),
              rag_generation_config={"stream": False, "max_tokens": 50},
              search_settings={"use_semantic_search": True, "limit": 3},
              conversation_id=conversation_id,
          ).results
          assert len(reply.messages) > 0, failure_message
  def test_complex_filters_and_fulltext(client: R2RClient, test_collection):
      """Exercise comparison, membership, ``$and`` and full-text filtering.

      Every filter is also scoped to the current user and to the collection
      supplied by the ``test_collection`` fixture, to make the expected
      counts robust against data created by other users.
      """
      owner = str(client.users.me().results.id)
      collection = str(test_collection["collection_id"])

      def scope():
          # Fresh dict per call so no filter object is shared between requests.
          return {
              "owner_id": {"$eq": owner},
              "collection_ids": {"$overlap": [collection]},
          }

      def run(query, settings):
          # Custom-mode search returning only the chunk hits.
          return client.retrieval.search(
              query=query,
              search_mode=SearchMode.custom,
              search_settings=settings,
          ).results.chunk_search_results

      # rating > 5
      # NOTE(review): this filter addresses bare "rating" while the $and
      # filter further down uses "metadata.rating" - confirm both forms are
      # intended.
      hits = run(
          "a",
          {
              "use_semantic_search": True,
              "filters": {"rating": {"$gt": 5}, **scope()},
          },
      )
      assert len(hits) == 2, f"Expected 2 docs with rating > 5, got {len(hits)}"

      # category in [ancient, modern]
      hits = run(
          "b",
          {
              "use_semantic_search": True,
              "filters": {
                  "metadata.category": {"$in": ["ancient", "modern"]},
                  **scope(),
              },
          },
      )
      assert len(hits) == 4, f"Expected all 4 docs, got {len(hits)}"

      # rating > 5 AND category=modern, each condition its own $and clause
      and_clauses = [
          {"metadata.rating": {"$gt": 5}},
          {"metadata.category": {"$eq": "modern"}},
      ]
      and_clauses.extend({field: cond} for field, cond in scope().items())
      hits = run("d", {"filters": {"$and": and_clauses}})
      assert (
          len(hits) == 2
      ), f"Expected 2 modern docs with rating>5, got {len(hits)}"

      # Full-text only lookup of a single term inside the scoped collection.
      hits = run(
          "unique_philosopher",
          {
              "use_fulltext_search": True,
              "use_semantic_search": False,
              "filters": scope(),
          },
      )
      assert (
          len(hits) == 1
      ), f"Expected 1 doc for unique_philosopher, got {len(hits)}"
  def test_complex_nested_filters(client: R2RClient, test_collection):
      """Search with an ``$or`` group nested inside an ``$and`` filter.

      Logical shape:
      (category == ancient OR rating < 5) AND tags contains 'philosophy',
      additionally scoped to the current user and the fixture's collection.
      """
      owner = str(client.users.me().results.id)
      collection = str(test_collection["collection_id"])

      either_ancient_or_low_rated = {
          "$or": [
              {"metadata.category": {"$eq": "ancient"}},
              {"metadata.rating": {"$lt": 5}},
          ]
      }
      filters = {
          "$and": [
              either_ancient_or_low_rated,
              {"metadata.tags": {"$contains": ["philosophy"]}},
              {"owner_id": {"$eq": owner}},
              {"collection_ids": {"$overlap": [collection]}},
          ],
      }

      response = client.retrieval.search(
          query="complex",
          search_settings={"filters": filters},
      )
      hits = response.results.chunk_search_results
      assert len(hits) == 2, f"Expected 2 docs, got {len(hits)}"
  def test_filters_no_match(client: R2RClient):
      """A filter on a category value that is never used must match nothing."""
      settings = {"filters": {"metadata.category": {"$in": ["nonexistent"]}}}
      response = client.retrieval.search(
          query="noresults",
          search_mode="custom",
          search_settings=settings,
      )
      hits = response.results.chunk_search_results
      assert len(hits) == 0, f"Expected 0 docs, got {len(hits)}"
  def test_pagination_extremes(client: R2RClient):
      """An offset past the total number of chunks must return an empty page."""
      # Go 100 past the reported chunk total so the page is out of range.
      past_the_end = client.chunks.list().total_entries + 100
      response = client.retrieval.search(
          query="Aristotle",
          search_mode="basic",
          search_settings={"limit": 10, "offset": past_the_end},
      )
      hits = response.results.chunk_search_results
      assert (
          len(hits) == 0
      ), f"Expected no results at large offset, got {len(hits)}"
  def test_full_text_stopwords(client: R2RClient):
      """A full-text search for the single word "the" must still return a payload."""
      settings = {
          "use_fulltext_search": True,
          "use_semantic_search": False,
          "limit": 5,
      }
      response = client.retrieval.search(
          query="the",
          search_mode="custom",
          search_settings=settings,
      )
      assert (
          response.results is not None
      ), "No results field in stopword query response"
  def test_full_text_non_ascii(client: R2RClient):
      """A full-text search with accented characters must still return a payload."""
      settings = {
          "use_fulltext_search": True,
          "use_semantic_search": False,
          "limit": 3,
      }
      response = client.retrieval.search(
          query="Aristotélēs",
          search_mode="custom",
          search_settings=settings,
      )
      assert (
          response.results is not None
      ), "No results field in non-ASCII query response"
  def test_missing_fields(client: R2RClient):
      """Filtering on a metadata field that does not exist must match nothing."""
      settings = {
          "filters": {"metadata.someNonExistentField": {"$eq": "anything"}},
      }
      response = client.retrieval.search(
          query="missingfield",
          search_mode="custom",
          search_settings=settings,
      )
      hits = response.results.chunk_search_results
      assert (
          len(hits) == 0
      ), f"Expected 0 docs for a non-existent field, got {len(hits)}"
  def test_rag_with_large_context(client: R2RClient):
      """RAG with a ten-chunk search limit must return a non-empty completion."""
      response = client.retrieval.rag(
          query="Explain the contributions of Kant in detail",
          rag_generation_config={"stream": False, "max_tokens": 200},
          search_settings={"use_semantic_search": True, "limit": 10},
      )
      answer = response.results.completion
      assert answer is not None, "RAG large context missing 'completion'"
      assert len(answer) > 0, "RAG large context returned empty answer"
  def test_agent_long_conversation(client: R2RClient):
      """Three successive agent turns in one conversation must each return results."""
      conversation_id = str(client.conversations.create().results.id)

      # (user prompt, assertion message) for each turn, in order.
      turns = [
          (
              "What were Aristotle's main ideas?",
              "No results in first turn of conversation",
          ),
          (
              "How did these ideas influence modern philosophy?",
              "No results in second turn of conversation",
          ),
          (
              "Now tell me about Descartes.",
              "No results in third turn of conversation",
          ),
      ]
      for prompt, failure_message in turns:
          response = client.retrieval.agent(
              message=Message(role="user", content=prompt),
              rag_generation_config={"stream": False, "max_tokens": 100},
              search_settings={"use_semantic_search": True, "limit": 5},
              conversation_id=conversation_id,
          )
          assert response.results is not None, failure_message
  def test_filter_by_document_type(client: R2RClient):
      """Filtering on ``document_type == 'txt'`` must return at least one chunk."""
      # Ingest three chunks that share one random suffix.
      random_suffix = str(uuid.uuid4())
      chunks = [f"{label} {random_suffix}" for label in ("a", "b", "c")]
      client.documents.create(chunks=chunks)

      response = client.retrieval.search(
          query="a",
          search_settings={"filters": {"document_type": {"$eq": "txt"}}},
      )
      hits = response.results.chunk_search_results
      assert len(hits) > 0, "No results found for filter by document type"
  def test_search_hyde_mode(client: R2RClient):
      """Integration test for a search using ``search_strategy='hyde'``.

      A five-chunk document is ingested, then a custom-mode search is issued
      with ``limit=5`` and ``num_sub_queries=5``. The test asserts that the
      response carries a chunk list and that it holds exactly 25 entries.
      """
      # 1) Create a test doc whose chunks each name one thinker; every chunk
      #    gets its own UUID so the text is unique across runs.
      thinkers = ("Aristotle", "Plato", "Socrates", "Pythagoras", "Euclid")
      client.documents.create(
          chunks=[
              f"{name}. Fulltext test doc. {uuid.uuid4()}" for name in thinkers
          ],
          metadata={"category": "test_hyde_fulltext"},
      )

      # 2) Perform a HyDE search; the strategy is selected in search_settings.
      resp = client.retrieval.search(
          query="Aristotle achievements?",
          search_mode="custom",
          search_settings={
              "search_strategy": "hyde",
              "use_semantic_search": True,
              "limit": 5,
              "num_sub_queries": 5,
          },
      )

      # 3) Validate structure first, then size, so a missing field fails
      #    with its own message rather than a TypeError from len().
      results = resp.results
      assert results is not None, "No results returned by HyDE search"
      chunk_results = results.chunk_search_results
      assert (
          chunk_results is not None
      ), "No chunk_search_results in HyDE search response"
      # NOTE(review): 25 presumably equals limit (5) x num_sub_queries (5),
      # and requires enough indexed chunks to fill every page - confirm.
      assert len(chunk_results) == 25, "Expected 25 chunk search results"
  def test_search_rag_fusion_mode(client: R2RClient):
      """Integration test for a search using ``search_strategy='rag_fusion'``.

      NOTE(review): the original note described rag_fusion as a placeholder
      that falls back to a basic search - confirm against the service code.
      The test checks that the call succeeds and returns five chunks.
      """
      # 1) Ingest a document with a unique marker in its text.
      suffix = str(uuid.uuid4())
      client.documents.create(
          raw_text=f"Plato was another Greek philosopher. RAGFusionTestDoc: {suffix}",
          metadata={"category": "test_rag_fusion"},
      )

      # 2) Perform a RAG-Fusion search (no num_sub_queries is passed here).
      settings = {
          "search_strategy": "rag_fusion",
          "use_semantic_search": True,
          "limit": 5,
      }
      resp = client.retrieval.search(
          query="Plato's contributions?",
          search_mode="custom",
          search_settings=settings,
      )

      # 3) Validate the results: structure first, then the count.
      results = resp.results
      assert results is not None, "No results returned by RAG-Fusion search"
      chunk_results = results.chunk_search_results
      assert chunk_results is not None, "No chunk_search_results for RAG-Fusion"
      assert len(chunk_results) == 5, "Expected 5 chunk search results"
  def test_rag_fusion_mode_with_subqueries(client: R2RClient):
      """RAG-Fusion search with ``num_sub_queries=3`` must not error out.

      Only the presence of a results payload is checked; the chunk contents
      are not inspected.
      """
      settings = {
          "search_strategy": "rag_fusion",
          "use_semantic_search": True,
          "limit": 5,
          "num_sub_queries": 3,
      }
      resp = client.retrieval.search(
          query="What are Plato's main dialogues?",
          search_mode="custom",
          search_settings=settings,
      )
      payload = resp.results
      assert payload is not None, "No results returned by RAG-Fusion with subqueries"
  def test_collection_id_filters(client: R2RClient):
      """Check that ``collection_id`` and ``collection_ids`` filters both work.

      Two collections are created: the first receives three documents and
      the second one document, all sharing a unique marker string. Five
      full-text searches for that marker are then run with different
      collection filters and their hit counts are compared.

      The collections are deleted in a ``finally`` clause so that a failing
      assertion or request does not leave them behind on the server.
      """
      import time

      created_collection_ids = []
      try:
          # Create the collection under test and a second one as a control.
          collection_id = client.collections.create(
              name=f"Collection Filter Test {uuid.uuid4()}"
          ).results.id
          created_collection_ids.append(collection_id)

          other_collection_id = client.collections.create(
              name=f"Other Collection {uuid.uuid4()}"
          ).results.id
          created_collection_ids.append(other_collection_id)

          # Unique identifier used to find exactly this test's documents.
          unique_marker = str(uuid.uuid4())

          # Three documents go into the first collection.
          for i in range(3):
              doc_id = client.documents.create(
                  raw_text=f"Test document {i} for collection filter test with marker {unique_marker}",
                  metadata={"test_group": "collection_filter_test"},
              ).results.document_id
              client.collections.add_document(
                  id=collection_id, document_id=doc_id
              )

          # One document goes into the second collection.
          doc_id = client.documents.create(
              raw_text=f"Test document in second collection with marker {unique_marker}",
              metadata={"test_group": "collection_filter_test"},
          ).results.document_id
          client.collections.add_document(
              id=other_collection_id, document_id=doc_id
          )

          # Presumably gives the server time to index the new documents;
          # the two-second pause is kept from the original test.
          time.sleep(2)

          def hits(filters):
              # Full-text-only search for the marker under ``filters``.
              return client.retrieval.search(
                  query=unique_marker,
                  search_mode="custom",
                  search_settings={
                      "use_fulltext_search": True,
                      "use_semantic_search": False,
                      "filters": filters,
                  },
              ).results.chunk_search_results

          # Test 1: collection_id filter (singular form) with $eq
          results1 = hits({"collection_id": {"$eq": str(collection_id)}})
          # Test 2: collection_ids filter (plural form) with $overlap
          results2 = hits({"collection_ids": {"$overlap": [str(collection_id)]}})
          # Test 3: $in operator with collection_id
          results3 = hits({"collection_id": {"$in": [str(collection_id)]}})
          # Test 4: both collections with $overlap
          results4 = hits(
              {
                  "collection_ids": {
                      "$overlap": [str(collection_id), str(other_collection_id)]
                  }
              }
          )
          # Test 5: a collection ID that was never created
          results5 = hits({"collection_id": {"$eq": str(uuid.uuid4())}})

          # The first three searches should return exactly the 3 chunks of
          # the first collection.
          assert len(results1) == 3, f"collection_id $eq filter returned {len(results1)} results, expected 3"
          assert len(results2) == 3, f"collection_ids $overlap filter returned {len(results2)} results, expected 3"
          assert len(results3) == 3, f"collection_id $in filter returned {len(results3)} results, expected 3"
          # Both collections together hold 4 chunks.
          assert len(results4) == 4, f"collection_ids $overlap with multiple IDs returned {len(results4)} results, expected 4"
          # A non-existent collection must match nothing.
          assert len(results5) == 0, f"Non-existent collection ID filter returned {len(results5)} results, expected 0"
      finally:
          # Clean up whatever was created, even if the test failed midway.
          for created_id in created_collection_ids:
              client.collections.delete(id=created_id)