documents.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. import json
  2. import os
  3. import tempfile
  4. import uuid
  5. from urllib.parse import urlparse
  6. import asyncclick as click
  7. import requests
  8. from asyncclick import pass_context
  9. from cli.utils.param_types import JSON
  10. from cli.utils.timer import timer
  11. from r2r import R2RAsyncClient
  12. @click.group()
  13. def documents():
  14. """Documents commands."""
  15. pass
  16. @documents.command()
  17. @click.argument(
  18. "file_paths", nargs=-1, required=True, type=click.Path(exists=True)
  19. )
  20. @click.option("--ids", multiple=True, help="Document IDs for ingestion")
  21. @click.option(
  22. "--metadatas", type=JSON, help="Metadatas for ingestion as a JSON string"
  23. )
  24. @click.option(
  25. "--run-without-orchestration", is_flag=True, help="Run with orchestration"
  26. )
  27. @pass_context
  28. async def create(ctx, file_paths, ids, metadatas, run_without_orchestration):
  29. """Ingest files into R2R."""
  30. client: R2RAsyncClient = ctx.obj
  31. run_with_orchestration = not run_without_orchestration
  32. responses = []
  33. for idx, file_path in enumerate(file_paths):
  34. with timer():
  35. current_id = [ids[idx]] if ids and idx < len(ids) else None
  36. current_metadata = (
  37. metadatas[idx] if metadatas and idx < len(metadatas) else None
  38. )
  39. click.echo(
  40. f"Processing file {idx + 1}/{len(file_paths)}: {file_path}"
  41. )
  42. response = await client.documents.create(
  43. file_path=file_path,
  44. metadata=current_metadata,
  45. id=current_id,
  46. run_with_orchestration=run_with_orchestration,
  47. )
  48. responses.append(response)
  49. click.echo(json.dumps(response, indent=2))
  50. click.echo("-" * 40)
  51. click.echo(f"\nProcessed {len(responses)} files successfully.")
  52. @documents.command()
  53. @click.argument("file_path", required=True, type=click.Path(exists=True))
  54. @click.option("--id", required=True, help="Existing document ID to update")
  55. @click.option(
  56. "--metadata", type=JSON, help="Metadatas for ingestion as a JSON string"
  57. )
  58. @click.option(
  59. "--run-without-orchestration", is_flag=True, help="Run with orchestration"
  60. )
  61. @pass_context
  62. async def update(ctx, file_path, id, metadata, run_without_orchestration):
  63. """Update an existing file in R2R."""
  64. client: R2RAsyncClient = ctx.obj
  65. run_with_orchestration = not run_without_orchestration
  66. responses = []
  67. with timer():
  68. click.echo(f"Updating file {id}: {file_path}")
  69. response = await client.documents.update(
  70. file_path=file_path,
  71. metadata=metadata,
  72. id=id,
  73. run_with_orchestration=run_with_orchestration,
  74. )
  75. responses.append(response)
  76. click.echo(json.dumps(response, indent=2))
  77. click.echo("-" * 40)
  78. click.echo(f"Updated file {id} file successfully.")
  79. @documents.command()
  80. @click.argument("id", required=True, type=str)
  81. @pass_context
  82. async def retrieve(ctx, id):
  83. """Retrieve a document by ID."""
  84. client: R2RAsyncClient = ctx.obj
  85. with timer():
  86. response = await client.documents.retrieve(id=id)
  87. click.echo(json.dumps(response, indent=2))
  88. @documents.command()
  89. @click.argument("id", required=True, type=str)
  90. @pass_context
  91. async def delete(ctx, id):
  92. """Delete a document by ID."""
  93. client: R2RAsyncClient = ctx.obj
  94. with timer():
  95. response = await client.documents.delete(id=id)
  96. click.echo(json.dumps(response, indent=2))
  97. @documents.command()
  98. @click.argument("id", required=True, type=str)
  99. @click.option(
  100. "--offset",
  101. default=0,
  102. help="The offset to start from. Defaults to 0.",
  103. )
  104. @click.option(
  105. "--limit",
  106. default=100,
  107. help="The maximum number of nodes to return. Defaults to 100.",
  108. )
  109. @pass_context
  110. async def list_chunks(ctx, id, offset, limit):
  111. """List collections for a specific document."""
  112. client: R2RAsyncClient = ctx.obj
  113. with timer():
  114. response = await client.documents.list_chunks(
  115. id=id,
  116. offset=offset,
  117. limit=limit,
  118. )
  119. click.echo(json.dumps(response, indent=2))
  120. @documents.command()
  121. @click.argument("id", required=True, type=str)
  122. @click.option(
  123. "--offset",
  124. default=0,
  125. help="The offset to start from. Defaults to 0.",
  126. )
  127. @click.option(
  128. "--limit",
  129. default=100,
  130. help="The maximum number of nodes to return. Defaults to 100.",
  131. )
  132. @pass_context
  133. async def list_collections(ctx, id, offset, limit):
  134. """List collections for a specific document."""
  135. client: R2RAsyncClient = ctx.obj
  136. with timer():
  137. response = await client.documents.list_collections(
  138. id=id,
  139. offset=offset,
  140. limit=limit,
  141. )
  142. click.echo(json.dumps(response, indent=2))
  143. # TODO
  144. async def ingest_files_from_urls(client, urls):
  145. """Download and ingest files from given URLs."""
  146. files_to_ingest = []
  147. metadatas = []
  148. document_ids = []
  149. temp_files = []
  150. try:
  151. for url in urls:
  152. filename = os.path.basename(urlparse(url).path)
  153. is_pdf = filename.lower().endswith(".pdf")
  154. temp_file = tempfile.NamedTemporaryFile(
  155. mode="wb" if is_pdf else "w+",
  156. delete=False,
  157. suffix=f"_{filename}",
  158. )
  159. temp_files.append(temp_file)
  160. response = requests.get(url)
  161. response.raise_for_status()
  162. if is_pdf:
  163. temp_file.write(response.content)
  164. else:
  165. temp_file.write(response.text)
  166. temp_file.close()
  167. files_to_ingest.append(temp_file.name)
  168. metadatas.append({"title": filename})
  169. # TODO: use the utils function generate_document_id
  170. document_ids.append(str(uuid.uuid5(uuid.NAMESPACE_DNS, url)))
  171. for it, file in enumerate(files_to_ingest):
  172. click.echo(f"Ingesting file: {file}")
  173. response = await client.documents.create(
  174. file, metadata=metadatas[it], id=document_ids[it]
  175. )
  176. return response["results"]
  177. finally:
  178. # Clean up temporary files
  179. for temp_file in temp_files:
  180. os.unlink(temp_file.name)
  181. # Missing CLI Commands
  182. @documents.command()
  183. @click.argument("id", required=True, type=str)
  184. @click.option("--run-type", help="Extraction run type (estimate or run)")
  185. @click.option("--settings", type=JSON, help="Extraction settings as JSON")
  186. @click.option(
  187. "--run-without-orchestration",
  188. is_flag=True,
  189. help="Run without orchestration",
  190. )
  191. @pass_context
  192. async def extract(ctx, id, run_type, settings, run_without_orchestration):
  193. """Extract entities and relationships from a document."""
  194. client: R2RAsyncClient = ctx.obj
  195. run_with_orchestration = not run_without_orchestration
  196. with timer():
  197. response = await client.documents.extract(
  198. id=id,
  199. run_type=run_type,
  200. settings=settings,
  201. run_with_orchestration=run_with_orchestration,
  202. )
  203. click.echo(json.dumps(response, indent=2))
  204. @documents.command()
  205. @click.argument("id", required=True, type=str)
  206. @click.option(
  207. "--offset",
  208. default=0,
  209. help="The offset to start from. Defaults to 0.",
  210. )
  211. @click.option(
  212. "--limit",
  213. default=100,
  214. help="The maximum number of items to return. Defaults to 100.",
  215. )
  216. @click.option(
  217. "--include-embeddings",
  218. is_flag=True,
  219. help="Include embeddings in response",
  220. )
  221. @pass_context
  222. async def list_entities(ctx, id, offset, limit, include_embeddings):
  223. """List entities extracted from a document."""
  224. client: R2RAsyncClient = ctx.obj
  225. with timer():
  226. response = await client.documents.list_entities(
  227. id=id,
  228. offset=offset,
  229. limit=limit,
  230. include_embeddings=include_embeddings,
  231. )
  232. click.echo(json.dumps(response, indent=2))
  233. @documents.command()
  234. @click.argument("id", required=True, type=str)
  235. @click.option(
  236. "--offset",
  237. default=0,
  238. help="The offset to start from. Defaults to 0.",
  239. )
  240. @click.option(
  241. "--limit",
  242. default=100,
  243. help="The maximum number of items to return. Defaults to 100.",
  244. )
  245. @click.option(
  246. "--entity-names",
  247. multiple=True,
  248. help="Filter by entity names",
  249. )
  250. @click.option(
  251. "--relationship-types",
  252. multiple=True,
  253. help="Filter by relationship types",
  254. )
  255. @pass_context
  256. async def list_relationships(
  257. ctx, id, offset, limit, entity_names, relationship_types
  258. ):
  259. """List relationships extracted from a document."""
  260. client: R2RAsyncClient = ctx.obj
  261. with timer():
  262. response = await client.documents.list_relationships(
  263. id=id,
  264. offset=offset,
  265. limit=limit,
  266. entity_names=list(entity_names) if entity_names else None,
  267. relationship_types=(
  268. list(relationship_types) if relationship_types else None
  269. ),
  270. )
  271. click.echo(json.dumps(response, indent=2))
  272. @documents.command()
  273. @click.option(
  274. "--v2", is_flag=True, help="use aristotle_v2.txt (a smaller file)"
  275. )
  276. @click.option(
  277. "--v3", is_flag=True, help="use aristotle_v3.txt (a larger file)"
  278. )
  279. @pass_context
  280. async def create_sample(ctx, v2=True, v3=False):
  281. """Ingest the first sample file into R2R."""
  282. sample_file_url = f"https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/aristotle.txt"
  283. client: R2RAsyncClient = ctx.obj
  284. with timer():
  285. response = await ingest_files_from_urls(client, [sample_file_url])
  286. click.echo(
  287. f"Sample file ingestion completed. Ingest files response:\n\n{response}"
  288. )
  289. @documents.command()
  290. @pass_context
  291. async def create_samples(ctx):
  292. """Ingest multiple sample files into R2R."""
  293. client: R2RAsyncClient = ctx.obj
  294. urls = [
  295. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_3.html",
  296. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_4.html",
  297. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_5.html",
  298. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/lyft_2021.pdf",
  299. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/uber_2021.pdf",
  300. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/got.txt",
  301. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_1.html",
  302. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_2.html",
  303. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/aristotle.txt",
  304. ]
  305. with timer():
  306. response = await ingest_files_from_urls(client, urls)
  307. click.echo(
  308. f"Sample files ingestion completed. Ingest files response:\n\n{response}"
  309. )
  310. @documents.command()
  311. @click.option("--ids", multiple=True, help="Document IDs to fetch")
  312. @click.option(
  313. "--offset",
  314. default=0,
  315. help="The offset to start from. Defaults to 0.",
  316. )
  317. @click.option(
  318. "--limit",
  319. default=100,
  320. help="The maximum number of nodes to return. Defaults to 100.",
  321. )
  322. @pass_context
  323. async def list(ctx, ids, offset, limit):
  324. """Get an overview of documents."""
  325. client: R2RAsyncClient = ctx.obj
  326. ids = list(ids) if ids else None
  327. with timer():
  328. response = await client.documents.list(
  329. ids=ids,
  330. offset=offset,
  331. limit=limit,
  332. )
  333. for document in response["results"]:
  334. click.echo(document)