test_documents_cli.py 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330
"""
Tests for the document commands in the CLI.

Commands marked "-" are exercised by this module; commands marked "x" are
not covered here yet.

    - create
    - retrieve
    - list
    - delete
    - list-chunks
    - list-collections
    x ingest-files-from-url
    x extract
    x list-entities
    x list-relationships
    x create-sample
    x create-samples
"""
  16. import contextlib
  17. import json
  18. import os
  19. import tempfile
  20. import uuid
  21. import pytest
  22. from click.testing import CliRunner
  23. from cli.commands.documents import (
  24. create,
  25. delete,
  26. list,
  27. list_chunks,
  28. list_collections,
  29. retrieve,
  30. )
  31. from r2r import R2RAsyncClient
  32. from tests.cli.async_invoke import async_invoke
  33. @pytest.fixture
  34. def temp_text_file():
  35. """Create a temporary text file for testing."""
  36. with tempfile.NamedTemporaryFile(
  37. mode="w", suffix=".txt", delete=False
  38. ) as f:
  39. f.write("This is test content for document testing.")
  40. temp_path = f.name
  41. yield temp_path
  42. # Cleanup temp file
  43. if os.path.exists(temp_path):
  44. os.unlink(temp_path)
  45. @pytest.fixture
  46. def temp_json_file():
  47. """Create a temporary JSON file for testing."""
  48. with tempfile.NamedTemporaryFile(
  49. mode="w", suffix=".json", delete=False
  50. ) as f:
  51. json.dump({"test": "content", "for": "document testing"}, f)
  52. temp_path = f.name
  53. yield temp_path
  54. # Cleanup temp file
  55. if os.path.exists(temp_path):
  56. os.unlink(temp_path)
  57. def extract_json_block(output: str) -> dict:
  58. """Extract and parse the first valid JSON object found in the output."""
  59. # We assume the output contains at least one JSON object printed with json.dumps(indent=2).
  60. # We'll find the first '{' and the matching closing '}' that forms a valid JSON object.
  61. start = output.find("{")
  62. if start == -1:
  63. raise ValueError("No JSON object start found in output")
  64. # Track braces to find the matching '}'
  65. brace_count = 0
  66. for i, char in enumerate(output[start:], start=start):
  67. if char == "{":
  68. brace_count += 1
  69. elif char == "}":
  70. brace_count -= 1
  71. if brace_count == 0:
  72. # Found the matching closing brace
  73. json_str = output[start : i + 1].strip()
  74. return json.loads(json_str)
  75. raise ValueError("No complete JSON object found in output")
  76. @pytest.mark.asyncio
  77. async def test_document_lifecycle(temp_text_file):
  78. """Test the complete lifecycle of a document: create, retrieve, delete."""
  79. client = R2RAsyncClient(base_url="http://localhost:7272")
  80. runner = CliRunner(mix_stderr=False)
  81. # Create document
  82. create_result = await async_invoke(
  83. runner, create, temp_text_file, obj=client
  84. )
  85. assert create_result.exit_code == 0, create_result.stdout_bytes.decode()
  86. output = create_result.stdout_bytes.decode()
  87. create_response = extract_json_block(output)
  88. document_id = create_response["results"]["document_id"]
  89. try:
  90. # Retrieve document
  91. retrieve_result = await async_invoke(
  92. runner, retrieve, document_id, obj=client
  93. )
  94. assert (
  95. retrieve_result.exit_code == 0
  96. ), retrieve_result.stdout_bytes.decode()
  97. # Instead of parsing JSON, verify the ID appears in the table output
  98. retrieve_output = retrieve_result.stdout_bytes.decode()
  99. assert document_id in retrieve_output
  100. # List chunks
  101. list_chunks_result = await async_invoke(
  102. runner, list_chunks, document_id, obj=client
  103. )
  104. assert (
  105. list_chunks_result.exit_code == 0
  106. ), list_chunks_result.stdout_bytes.decode()
  107. # List collections
  108. list_collections_result = await async_invoke(
  109. runner, list_collections, document_id, obj=client
  110. )
  111. assert (
  112. list_collections_result.exit_code == 0
  113. ), list_collections_result.stdout_bytes.decode()
  114. finally:
  115. # Delete document
  116. delete_result = await async_invoke(
  117. runner, delete, document_id, obj=client
  118. )
  119. assert (
  120. delete_result.exit_code == 0
  121. ), delete_result.stdout_bytes.decode()
  122. @pytest.mark.asyncio
  123. async def test_create_multiple_documents(temp_text_file, temp_json_file):
  124. """Test creating multiple documents with metadata."""
  125. client = R2RAsyncClient(base_url="http://localhost:7272")
  126. runner = CliRunner(mix_stderr=False)
  127. metadatas = json.dumps(
  128. [
  129. {"description": "Test document 1"},
  130. {"description": "Test document 2"},
  131. ]
  132. )
  133. create_result = await async_invoke(
  134. runner,
  135. create,
  136. temp_text_file,
  137. temp_json_file,
  138. "--metadatas",
  139. metadatas,
  140. obj=client,
  141. )
  142. assert create_result.exit_code == 0, create_result.stdout_bytes.decode()
  143. output = create_result.stdout_bytes.decode()
  144. # The command may print multiple JSON objects separated by dashes and status lines.
  145. # Extract all JSON objects.
  146. json_objects = []
  147. start_idx = 0
  148. while True:
  149. try:
  150. # Attempt to extract a JSON object from output[start_idx:]
  151. block = extract_json_block(output[start_idx:])
  152. json_objects.append(block)
  153. # Move start_idx beyond this block to find the next one
  154. next_start = output[start_idx:].find("{")
  155. start_idx += output[start_idx:].find("{") + 1
  156. # Move past the first '{' we found
  157. # Actually, let's break after one extraction to avoid infinite loops if the output is large.
  158. # Instead, we find multiple objects by splitting on the line of dashes:
  159. break
  160. except ValueError:
  161. break
  162. # Alternatively, if multiple objects are separated by "----------", we can split and parse each:
  163. # This assumes each block between "----------" lines contains exactly one JSON object.
  164. blocks = output.split("-" * 40)
  165. json_objects = []
  166. for block in blocks:
  167. block = block.strip()
  168. if '"results"' in block and "{" in block and "}" in block:
  169. with contextlib.suppress(ValueError):
  170. json_objects.append(extract_json_block(block))
  171. assert (
  172. len(json_objects) == 2
  173. ), f"Expected 2 JSON objects, got {len(json_objects)}: {output}"
  174. document_ids = [obj["results"]["document_id"] for obj in json_objects]
  175. try:
  176. # List all documents
  177. list_result = await async_invoke(runner, list, obj=client)
  178. assert list_result.exit_code == 0, list_result.stdout_bytes.decode()
  179. # Verify both documents were created
  180. for doc_id in document_ids:
  181. retrieve_result = await async_invoke(
  182. runner, retrieve, doc_id, obj=client
  183. )
  184. assert (
  185. retrieve_result.exit_code == 0
  186. ), retrieve_result.stdout_bytes.decode()
  187. finally:
  188. # Cleanup - delete all created documents
  189. for doc_id in document_ids:
  190. delete_result = await async_invoke(
  191. runner, delete, doc_id, obj=client
  192. )
  193. assert (
  194. delete_result.exit_code == 0
  195. ), delete_result.stdout_bytes.decode()
  196. @pytest.mark.asyncio
  197. async def test_create_with_custom_id():
  198. """Test creating a document with a custom ID."""
  199. client = R2RAsyncClient(base_url="http://localhost:7272")
  200. runner = CliRunner(mix_stderr=False)
  201. custom_id = str(uuid.uuid4())
  202. with tempfile.NamedTemporaryFile(
  203. mode="w", suffix=".txt", delete=False
  204. ) as f:
  205. f.write("Test content")
  206. temp_path = f.name
  207. try:
  208. create_result = await async_invoke(
  209. runner, create, temp_path, "--ids", custom_id, obj=client
  210. )
  211. assert (
  212. create_result.exit_code == 0
  213. ), create_result.stdout_bytes.decode()
  214. output = create_result.stdout_bytes.decode()
  215. create_response = extract_json_block(output)
  216. assert create_response["results"]["document_id"] == custom_id
  217. finally:
  218. if os.path.exists(temp_path):
  219. os.unlink(temp_path)
  220. await async_invoke(runner, delete, custom_id, obj=client)
  221. @pytest.mark.asyncio
  222. async def test_retrieve_nonexistent_document():
  223. """Test retrieving a document that doesn't exist."""
  224. client = R2RAsyncClient(base_url="http://localhost:7272")
  225. runner = CliRunner(mix_stderr=False)
  226. nonexistent_id = str(uuid.uuid4())
  227. result = await async_invoke(runner, retrieve, nonexistent_id, obj=client)
  228. stderr = result.stderr_bytes.decode()
  229. assert (
  230. "Document not found" in stderr
  231. or "Document not found" in result.stdout_bytes.decode()
  232. )
  233. @pytest.mark.asyncio
  234. async def test_list_chunks_nonexistent_document():
  235. """Test listing chunks for a document that doesn't exist."""
  236. client = R2RAsyncClient(base_url="http://localhost:7272")
  237. runner = CliRunner(mix_stderr=False)
  238. nonexistent_id = str(uuid.uuid4())
  239. result = await async_invoke(
  240. runner, list_chunks, nonexistent_id, obj=client
  241. )
  242. stderr = result.stderr_bytes.decode()
  243. assert (
  244. "No chunks found for the given document ID." in stderr
  245. or "No chunks found for the given document ID."
  246. in result.stdout_bytes.decode()
  247. )
# FIXME: `list_collections` for a nonexistent document should report
# 'Document not found', but it currently returns an empty list instead, so
# the test below stays disabled until that behavior is fixed.
# @pytest.mark.asyncio
# async def test_list_collections_nonexistent_document():
#     """Test listing collections for a document that doesn't exist."""
#     client = R2RAsyncClient(base_url="http://localhost:7272")
#     runner = CliRunner(mix_stderr=False)
#
#     nonexistent_id = str(uuid.uuid4())
#     result = await async_invoke(
#         runner, list_collections, nonexistent_id, obj=client
#     )
#
#     stderr = result.stderr_bytes.decode()
#     assert (
#         "Document not found" in stderr
#         or "Document not found" in result.stdout_bytes.decode()
#     )
  263. @pytest.mark.asyncio
  264. async def test_delete_nonexistent_document():
  265. """Test deleting a document that doesn't exist."""
  266. client = R2RAsyncClient(base_url="http://localhost:7272")
  267. runner = CliRunner(mix_stderr=False)
  268. nonexistent_id = str(uuid.uuid4())
  269. result = await async_invoke(runner, delete, nonexistent_id, obj=client)
  270. stderr = result.stderr_bytes.decode()
  271. assert (
  272. "No entries found for deletion" in stderr
  273. or "No entries found for deletion" in result.stdout_bytes.decode()
  274. )