documents.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569
  1. import json
  2. import os
  3. import tempfile
  4. import uuid
  5. from builtins import list as _list
  6. from typing import Any, Optional, Sequence
  7. from urllib.parse import urlparse
  8. from uuid import UUID
  9. import asyncclick as click
  10. import requests
  11. from asyncclick import pass_context
  12. from rich.box import ROUNDED
  13. from rich.console import Console
  14. from rich.table import Table
  15. from cli.utils.param_types import JSON
  16. from cli.utils.timer import timer
  17. from r2r import R2RAsyncClient, R2RException
  18. console = Console()
  19. @click.group()
  20. def documents():
  21. """Documents commands."""
  22. pass
  23. @documents.command()
  24. @click.argument(
  25. "file_paths", nargs=-1, required=True, type=click.Path(exists=True)
  26. )
  27. @click.option("--ids", multiple=True, help="Document IDs for ingestion")
  28. @click.option(
  29. "--metadatas", type=JSON, help="Metadatas for ingestion as a JSON string"
  30. )
  31. @click.option(
  32. "--run-without-orchestration", is_flag=True, help="Run with orchestration"
  33. )
  34. @pass_context
  35. async def create(
  36. ctx: click.Context,
  37. file_paths: tuple[str, ...],
  38. ids: Optional[tuple[str, ...]] = None,
  39. metadatas: Optional[Sequence[dict[str, Any]]] = None,
  40. run_without_orchestration: bool = False,
  41. ):
  42. """Ingest files into R2R."""
  43. client: R2RAsyncClient = ctx.obj
  44. run_with_orchestration = not run_without_orchestration
  45. responses: _list[dict[str, Any]] = []
  46. for idx, file_path in enumerate(file_paths):
  47. with timer():
  48. current_id = ids[idx] if ids and idx < len(ids) else None
  49. current_metadata = (
  50. metadatas[idx] if metadatas and idx < len(metadatas) else None
  51. )
  52. click.echo(
  53. f"Processing file {idx + 1}/{len(file_paths)}: {file_path}"
  54. )
  55. try:
  56. response = await client.documents.create(
  57. file_path=file_path,
  58. metadata=current_metadata,
  59. id=current_id,
  60. run_with_orchestration=run_with_orchestration,
  61. )
  62. responses.append(response) # type: ignore
  63. click.echo(json.dumps(response, indent=2))
  64. click.echo("-" * 40)
  65. except R2RException as e:
  66. click.echo(str(e), err=True)
  67. except Exception as e:
  68. click.echo(str(f"An unexpected error occurred: {e}"), err=True)
  69. click.echo(f"\nProcessed {len(responses)} files successfully.")
  70. @documents.command()
  71. @click.option("--ids", multiple=True, help="Document IDs to fetch")
  72. @click.option(
  73. "--offset",
  74. default=0,
  75. help="The offset to start from. Defaults to 0.",
  76. )
  77. @click.option(
  78. "--limit",
  79. default=100,
  80. help="The maximum number of nodes to return. Defaults to 100.",
  81. )
  82. @pass_context
  83. async def list(
  84. ctx: click.Context,
  85. ids: Optional[tuple[str, ...]] = None,
  86. offset: int = 0,
  87. limit: int = 100,
  88. ) -> None:
  89. """Get an overview of documents."""
  90. ids = list(ids) if ids else None
  91. client: R2RAsyncClient = ctx.obj
  92. try:
  93. with timer():
  94. response = await client.documents.list(
  95. ids=ids,
  96. offset=offset,
  97. limit=limit,
  98. )
  99. table = Table(
  100. title="[bold blue]Documents[/bold blue]",
  101. show_header=True,
  102. header_style="bold white on blue",
  103. border_style="blue",
  104. box=ROUNDED,
  105. pad_edge=False,
  106. collapse_padding=True,
  107. show_lines=True,
  108. )
  109. # Add columns based on your document structure
  110. table.add_column("ID", style="bright_yellow", no_wrap=True)
  111. table.add_column("Type", style="bright_magenta")
  112. table.add_column("Title", style="bright_green")
  113. table.add_column("Ingestion Status", style="bright_cyan")
  114. table.add_column("Extraction Status", style="bright_cyan")
  115. table.add_column("Summary", style="bright_white")
  116. table.add_column("Created At", style="bright_white")
  117. for document in response["results"]: # type: ignore
  118. table.add_row(
  119. document.get("id", ""),
  120. document.get("document_type", ""),
  121. document.get("title", ""),
  122. document.get("ingestion_status", ""),
  123. document.get("extraction_status", ""),
  124. document.get("summary", ""),
  125. document.get("created_at", "")[:19],
  126. )
  127. console = Console()
  128. console.print("\n")
  129. console.print(table)
  130. console.print(
  131. f"\n[dim]Showing {len(response['results'])} documents (offset: {offset}, limit: {limit})[/dim]" # type: ignore
  132. )
  133. except R2RException as e:
  134. console.print(f"[bold red]Error:[/bold red] {str(e)}")
  135. except Exception as e:
  136. console.print(f"[bold red]Unexpected error:[/bold red] {str(e)}")
  137. @documents.command()
  138. @click.argument("id", required=True, type=str)
  139. @pass_context
  140. async def retrieve(ctx: click.Context, id: UUID):
  141. """Retrieve a document by ID."""
  142. client: R2RAsyncClient = ctx.obj
  143. console = Console()
  144. try:
  145. with timer():
  146. response = await client.documents.retrieve(id=id)
  147. # Get the actual document data from the results
  148. document = response["results"] # type: ignore
  149. metadata_table = Table(
  150. show_header=True,
  151. header_style="bold white on blue",
  152. border_style="blue",
  153. box=ROUNDED,
  154. title="[bold blue]Document Details[/bold blue]",
  155. show_lines=True,
  156. )
  157. metadata_table.add_column("Field", style="bright_yellow")
  158. metadata_table.add_column("Value", style="bright_white")
  159. # Add core document information
  160. core_fields = [
  161. ("ID", document.get("id", "")),
  162. ("Type", document.get("document_type", "")),
  163. ("Title", document.get("title", "")),
  164. ("Created At", document.get("created_at", "")[:19]),
  165. ("Updated At", document.get("updated_at", "")[:19]),
  166. ("Ingestion Status", document.get("ingestion_status", "")),
  167. ("Extraction Status", document.get("extraction_status", "")),
  168. ("Size", f"{document.get('size_in_bytes', 0):,} bytes"),
  169. ]
  170. for field, value in core_fields:
  171. metadata_table.add_row(field, str(value))
  172. # Add metadata section if it exists
  173. if "metadata" in document:
  174. metadata_table.add_row(
  175. "[bold]Metadata[/bold]", "", style="bright_blue"
  176. )
  177. for key, value in document["metadata"].items():
  178. metadata_table.add_row(f" {key}", str(value))
  179. # Add summary if it exists
  180. if "summary" in document:
  181. metadata_table.add_row(
  182. "[bold]Summary[/bold]",
  183. document["summary"],
  184. )
  185. console.print("\n")
  186. console.print(metadata_table)
  187. console.print("\n")
  188. except R2RException as e:
  189. console.print(f"[bold red]Error:[/bold red] {str(e)}")
  190. except Exception as e:
  191. console.print(f"[bold red]Unexpected error:[/bold red] {str(e)}")
  192. @documents.command()
  193. @click.argument("id", required=True, type=str)
  194. @pass_context
  195. async def delete(ctx: click.Context, id):
  196. """Delete a document by ID."""
  197. client: R2RAsyncClient = ctx.obj
  198. try:
  199. with timer():
  200. response = await client.documents.delete(id=id)
  201. click.echo(json.dumps(response, indent=2))
  202. except R2RException as e:
  203. click.echo(str(e), err=True)
  204. except Exception as e:
  205. click.echo(str(f"An unexpected error occurred: {e}"), err=True)
  206. @documents.command()
  207. @click.argument("id", required=True, type=str)
  208. @click.option(
  209. "--offset",
  210. default=0,
  211. help="The offset to start from. Defaults to 0.",
  212. )
  213. @click.option(
  214. "--limit",
  215. default=100,
  216. help="The maximum number of nodes to return. Defaults to 100.",
  217. )
  218. @pass_context
  219. async def list_chunks(ctx: click.Context, id, offset, limit):
  220. """List chunks for a specific document."""
  221. client: R2RAsyncClient = ctx.obj
  222. console = Console()
  223. try:
  224. with timer():
  225. response = await client.documents.list_chunks(
  226. id=id,
  227. offset=offset,
  228. limit=limit,
  229. )
  230. table = Table(
  231. title="[bold blue]Document Chunks[/bold blue]",
  232. show_header=True,
  233. header_style="bold white on blue",
  234. border_style="blue",
  235. box=ROUNDED,
  236. pad_edge=False,
  237. collapse_padding=True,
  238. show_lines=True,
  239. )
  240. table.add_column("ID", style="bright_yellow", no_wrap=True)
  241. table.add_column("Text", style="bright_white")
  242. for chunk in response["results"]: # type: ignore
  243. table.add_row(
  244. chunk.get("id", ""),
  245. (
  246. chunk.get("text", "")[:200] + "..."
  247. if len(chunk.get("text", "")) > 200
  248. else chunk.get("text", "")
  249. ),
  250. )
  251. console.print("\n")
  252. console.print(table)
  253. console.print(
  254. f"\n[dim]Showing {len(response['results'])} chunks (offset: {offset}, limit: {limit})[/dim]" # type: ignore
  255. )
  256. except R2RException as e:
  257. console.print(f"[bold red]Error:[/bold red] {str(e)}")
  258. except Exception as e:
  259. console.print(f"[bold red]Unexpected error:[/bold red] {str(e)}")
  260. @documents.command()
  261. @click.argument("id", required=True, type=str)
  262. @click.option(
  263. "--offset",
  264. default=0,
  265. help="The offset to start from. Defaults to 0.",
  266. )
  267. @click.option(
  268. "--limit",
  269. default=100,
  270. help="The maximum number of nodes to return. Defaults to 100.",
  271. )
  272. @pass_context
  273. async def list_collections(ctx: click.Context, id, offset, limit):
  274. """List collections for a specific document."""
  275. client: R2RAsyncClient = ctx.obj
  276. console = Console()
  277. try:
  278. with timer():
  279. response = await client.documents.list_collections(
  280. id=id,
  281. offset=offset,
  282. limit=limit,
  283. )
  284. table = Table(
  285. title="[bold blue]Document Collections[/bold blue]",
  286. show_header=True,
  287. header_style="bold white on blue",
  288. border_style="blue",
  289. box=ROUNDED,
  290. pad_edge=False,
  291. collapse_padding=True,
  292. show_lines=True,
  293. )
  294. table.add_column("ID", style="bright_yellow", no_wrap=True)
  295. table.add_column("Name", style="bright_green")
  296. table.add_column("Description", style="bright_white")
  297. table.add_column("Created At", style="bright_white")
  298. for collection in response["results"]: # type: ignore
  299. table.add_row(
  300. collection.get("id", ""),
  301. collection.get("name", ""),
  302. collection.get("description", ""),
  303. collection.get("created_at", "")[:19],
  304. )
  305. console.print("\n")
  306. console.print(table)
  307. console.print(
  308. f"\n[dim]Showing {len(response['results'])} collections (offset: {offset}, limit: {limit})[/dim]" # type: ignore
  309. )
  310. except R2RException as e:
  311. console.print(f"[bold red]Error:[/bold red] {str(e)}")
  312. except Exception as e:
  313. console.print(f"[bold red]Unexpected error:[/bold red] {str(e)}")
  314. # TODO
  315. async def ingest_files_from_urls(client, urls):
  316. """Download and ingest files from given URLs."""
  317. files_to_ingest = []
  318. metadatas = []
  319. document_ids = []
  320. temp_files = []
  321. try:
  322. for url in urls:
  323. filename = os.path.basename(urlparse(url).path)
  324. is_pdf = filename.lower().endswith(".pdf")
  325. temp_file = tempfile.NamedTemporaryFile(
  326. mode="wb" if is_pdf else "w+",
  327. delete=False,
  328. suffix=f"_{filename}",
  329. )
  330. temp_files.append(temp_file)
  331. response = requests.get(url)
  332. response.raise_for_status()
  333. if is_pdf:
  334. temp_file.write(response.content)
  335. else:
  336. temp_file.write(response.text)
  337. temp_file.close()
  338. files_to_ingest.append(temp_file.name)
  339. metadatas.append({"title": filename})
  340. # TODO: use the utils function generate_document_id
  341. document_ids.append(str(uuid.uuid5(uuid.NAMESPACE_DNS, url)))
  342. for it, file in enumerate(files_to_ingest):
  343. click.echo(f"Ingesting file: {file}")
  344. response = await client.documents.create(
  345. file, metadata=metadatas[it], id=document_ids[it]
  346. )
  347. return response["results"]
  348. finally:
  349. # Clean up temporary files
  350. for temp_file in temp_files:
  351. os.unlink(temp_file.name)
  352. # Missing CLI Commands
  353. @documents.command()
  354. @click.argument("id", required=True, type=str)
  355. @click.option("--run-type", help="Extraction run type (estimate or run)")
  356. @click.option("--settings", type=JSON, help="Extraction settings as JSON")
  357. @click.option(
  358. "--run-without-orchestration",
  359. is_flag=True,
  360. help="Run without orchestration",
  361. )
  362. @pass_context
  363. async def extract(
  364. ctx: click.Context, id, run_type, settings, run_without_orchestration
  365. ):
  366. """Extract entities and relationships from a document."""
  367. client: R2RAsyncClient = ctx.obj
  368. run_with_orchestration = not run_without_orchestration
  369. with timer():
  370. response = await client.documents.extract(
  371. id=id,
  372. run_type=run_type,
  373. settings=settings,
  374. run_with_orchestration=run_with_orchestration,
  375. )
  376. click.echo(json.dumps(response, indent=2))
  377. @documents.command()
  378. @click.argument("id", required=True, type=str)
  379. @click.option(
  380. "--offset",
  381. default=0,
  382. help="The offset to start from. Defaults to 0.",
  383. )
  384. @click.option(
  385. "--limit",
  386. default=100,
  387. help="The maximum number of items to return. Defaults to 100.",
  388. )
  389. @click.option(
  390. "--include-embeddings",
  391. is_flag=True,
  392. help="Include embeddings in response",
  393. )
  394. @pass_context
  395. async def list_entities(
  396. ctx: click.Context, id, offset, limit, include_embeddings
  397. ):
  398. """List entities extracted from a document."""
  399. client: R2RAsyncClient = ctx.obj
  400. try:
  401. with timer():
  402. response = await client.documents.list_entities(
  403. id=id,
  404. offset=offset,
  405. limit=limit,
  406. include_embeddings=include_embeddings,
  407. )
  408. click.echo(json.dumps(response, indent=2))
  409. except R2RException as e:
  410. click.echo(str(e), err=True)
  411. except Exception as e:
  412. click.echo(str(f"An unexpected error occurred: {e}"), err=True)
  413. @documents.command()
  414. @click.argument("id", required=True, type=str)
  415. @click.option(
  416. "--offset",
  417. default=0,
  418. help="The offset to start from. Defaults to 0.",
  419. )
  420. @click.option(
  421. "--limit",
  422. default=100,
  423. help="The maximum number of items to return. Defaults to 100.",
  424. )
  425. @click.option(
  426. "--entity-names",
  427. multiple=True,
  428. help="Filter by entity names",
  429. )
  430. @click.option(
  431. "--relationship-types",
  432. multiple=True,
  433. help="Filter by relationship types",
  434. )
  435. @pass_context
  436. async def list_relationships(
  437. ctx: click.Context, id, offset, limit, entity_names, relationship_types
  438. ):
  439. """List relationships extracted from a document."""
  440. client: R2RAsyncClient = ctx.obj
  441. try:
  442. with timer():
  443. response = await client.documents.list_relationships(
  444. id=id,
  445. offset=offset,
  446. limit=limit,
  447. entity_names=list(entity_names) if entity_names else None,
  448. relationship_types=(
  449. list(relationship_types) if relationship_types else None
  450. ),
  451. )
  452. click.echo(json.dumps(response, indent=2))
  453. except R2RException as e:
  454. click.echo(str(e), err=True)
  455. except Exception as e:
  456. click.echo(str(f"An unexpected error occurred: {e}"), err=True)
  457. @documents.command()
  458. @pass_context
  459. async def create_sample(ctx: click.Context) -> None:
  460. """Ingest the first sample file into R2R."""
  461. sample_file_url = "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/aristotle.txt"
  462. client: R2RAsyncClient = ctx.obj
  463. try:
  464. with timer():
  465. response = await ingest_files_from_urls(client, [sample_file_url])
  466. click.echo(
  467. f"Sample file ingestion completed. Ingest files response:\n\n{response}"
  468. )
  469. except R2RException as e:
  470. click.echo(str(e), err=True)
  471. except Exception as e:
  472. click.echo(str(f"An unexpected error occurred: {e}"), err=True)
  473. @documents.command()
  474. @pass_context
  475. async def create_samples(ctx: click.Context) -> None:
  476. """Ingest multiple sample files into R2R."""
  477. urls = [
  478. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_3.html",
  479. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_4.html",
  480. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_5.html",
  481. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/lyft_2021.pdf",
  482. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/uber_2021.pdf",
  483. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/got.txt",
  484. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_1.html",
  485. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/pg_essay_2.html",
  486. "https://raw.githubusercontent.com/SciPhi-AI/R2R/main/py/core/examples/data/aristotle.txt",
  487. ]
  488. client: R2RAsyncClient = ctx.obj
  489. try:
  490. with timer():
  491. response = await ingest_files_from_urls(client, urls)
  492. click.echo(
  493. f"Sample files ingestion completed. Ingest files response:\n\n{response}"
  494. )
  495. except R2RException as e:
  496. click.echo(str(e), err=True)