documents_router.py 67 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695
  1. import base64
  2. import json
  3. import logging
  4. import mimetypes
  5. import textwrap
  6. from io import BytesIO
  7. from typing import Any, Optional
  8. from uuid import UUID
  9. from fastapi import Body, Depends, File, Form, Path, Query, UploadFile
  10. from fastapi.responses import StreamingResponse
  11. from pydantic import Json
  12. from core.base import (
  13. IngestionConfig,
  14. IngestionMode,
  15. R2RException,
  16. SearchMode,
  17. SearchSettings,
  18. UnprocessedChunk,
  19. Workflow,
  20. generate_document_id,
  21. generate_id,
  22. select_search_filters,
  23. )
  24. from core.base.abstractions import KGCreationSettings, KGRunType
  25. from core.base.api.models import (
  26. GenericBooleanResponse,
  27. WrappedBooleanResponse,
  28. WrappedChunksResponse,
  29. WrappedCollectionsResponse,
  30. WrappedDocumentResponse,
  31. WrappedDocumentsResponse,
  32. WrappedEntitiesResponse,
  33. WrappedGenericMessageResponse,
  34. WrappedIngestionResponse,
  35. WrappedRelationshipsResponse,
  36. )
  37. from core.utils import update_settings_from_dict
  38. from ...abstractions import R2RProviders, R2RServices
  39. from .base_router import BaseRouterV3
  # Logger used by the endpoints below (this is the root logger, as obtained
  # by calling ``getLogger`` with no name).
  logger = logging.getLogger()

  # Upper bound on the number of pre-processed chunks accepted by a single
  # create-document request (102,400).
  MAX_CHUNKS_PER_REQUEST = 100 * 1024
  def merge_search_settings(
      base: SearchSettings, overrides: SearchSettings
  ) -> SearchSettings:
      """Build a new ``SearchSettings`` with ``overrides`` layered on ``base``.

      Every field of ``base`` is dumped, then each field that was explicitly
      set on ``overrides`` (``exclude_unset=True``) replaces the matching
      top-level entry. The merge is shallow: an overridden field replaces the
      base value wholesale. Neither input is modified.
      """
      merged_fields = {
          **base.model_dump(),
          # Later unpacking wins, so explicitly-set overrides take precedence.
          **overrides.model_dump(exclude_unset=True),
      }
      return SearchSettings(**merged_fields)
  def merge_ingestion_config(
      base: IngestionConfig, overrides: IngestionConfig
  ) -> IngestionConfig:
      """Build a new ``IngestionConfig`` with ``overrides`` layered on ``base``.

      Starts from a full dump of ``base`` and shallowly replaces each
      top-level field that was explicitly set on ``overrides``
      (``exclude_unset=True``). Neither input is modified.
      """
      merged_fields = base.model_dump()
      merged_fields.update(overrides.model_dump(exclude_unset=True))
      return IngestionConfig(**merged_fields)
  62. class DocumentsRouter(BaseRouterV3):
  def __init__(self, providers: R2RProviders, services: R2RServices):
      """Initialise the base router, then register the ingestion workflows.

      Args:
          providers: Provider bundle forwarded to ``BaseRouterV3``.
          services: Service bundle forwarded to ``BaseRouterV3``.
      """
      super().__init__(providers, services)
      # Registration needs ``self.providers`` / ``self.services`` to be
      # available, so it must run after the base initialiser.
      self._register_workflows()
  def _prepare_search_settings(
      self,
      auth_user: Any,
      search_mode: SearchMode,
      search_settings: Optional[SearchSettings],
  ) -> SearchSettings:
      """Resolve the search settings to use for a request.

      In ``custom`` mode the caller's ``search_settings`` are used as-is
      (or a default ``SearchSettings()`` when none were given). In any other
      mode the defaults for that mode are loaded and the caller's settings,
      if any, are merged on top. In both cases the ``filters`` attribute is
      then replaced with the result of ``select_search_filters`` for
      ``auth_user``.

      Note that in ``custom`` mode the object passed in as
      ``search_settings`` is the one whose ``filters`` gets reassigned.
      """
      if search_mode == SearchMode.custom:
          resolved = search_settings or SearchSettings()
      else:
          resolved = SearchSettings.get_default(search_mode.value)
          if search_settings:
              # Caller-provided fields win over the mode defaults.
              resolved = merge_search_settings(resolved, search_settings)

      # Apply user-specific filters.
      resolved.filters = select_search_filters(auth_user, resolved)
      return resolved
  96. # TODO - Remove this legacy method
  def _register_workflows(self):
      """Register the ingestion workflows with the orchestration provider.

      Each workflow name is mapped to a user-facing message. The message
      depends on whether the configured orchestration provider is
      ``"simple"`` or anything else.
      """
      orchestration = self.providers.orchestration
      is_simple_provider = orchestration.config.provider == "simple"

      # (workflow name, message for non-"simple" providers,
      #  message for the "simple" provider)
      message_table = (
          (
              "ingest-files",
              "Ingest files task queued successfully.",
              "Document created and ingested successfully.",
          ),
          (
              "ingest-chunks",
              "Ingest chunks task queued successfully.",
              "Document created and ingested successfully.",
          ),
          (
              "update-files",
              "Update file task queued successfully.",
              "Update task queued successfully.",
          ),
          (
              "update-chunk",
              "Update chunk task queued successfully.",
              "Chunk update completed successfully.",
          ),
          (
              "update-document-metadata",
              "Update document metadata task queued successfully.",
              "Document metadata update completed successfully.",
          ),
          (
              "create-vector-index",
              "Vector index creation task queued successfully.",
              "Vector index creation task completed successfully.",
          ),
          (
              "delete-vector-index",
              "Vector index deletion task queued successfully.",
              "Vector index deletion task completed successfully.",
          ),
          (
              "select-vector-index",
              "Vector index selection task queued successfully.",
              "Vector index selection task completed successfully.",
          ),
      )
      workflow_messages = {
          workflow_name: (
              simple_message if is_simple_provider else queued_message
          )
          for workflow_name, queued_message, simple_message in message_table
      }

      orchestration.register_workflows(
          Workflow.INGESTION,
          self.services.ingestion,
          workflow_messages,
      )
  def _prepare_ingestion_config(
      self,
      ingestion_mode: IngestionMode,
      ingestion_config: Optional[IngestionConfig],
  ) -> IngestionConfig:
      """Resolve and validate the ingestion configuration for a request.

      In ``custom`` mode the caller's ``ingestion_config`` is used as-is, or
      a fresh ``IngestionConfig`` bound to the auth provider's ``app`` when
      none was given. In any other mode the defaults for that mode are loaded
      and the caller's config, if any, is merged on top. The result is passed
      through ``validate_config()`` before being returned.
      """
      app = self.providers.auth.config.app

      if ingestion_mode == IngestionMode.custom:
          resolved = ingestion_config or IngestionConfig(app=app)
      else:
          resolved = IngestionConfig.get_default(
              ingestion_mode.value, app=app
          )
          if ingestion_config:
              # Caller-provided fields win over the mode defaults.
              resolved = merge_ingestion_config(resolved, ingestion_config)

      resolved.validate_config()
      return resolved
  165. def _setup_routes(self):
  @self.router.post(
      "/documents",
      dependencies=[Depends(self.rate_limit_dependency)],
      status_code=202,
      summary="Create a new document",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.create(
                          file_path="pg_essay_1.html",
                          metadata={"metadata_1":"some random metadata"},
                          id=None
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      function main() {
                          const response = await client.documents.create({
                              file: { path: "examples/data/marmeladov.txt", name: "marmeladov.txt" },
                              metadata: { title: "marmeladov.txt" },
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "CLI",
                  "source": textwrap.dedent(
                      """
                      r2r documents create /path/to/file.txt
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X POST "https://api.example.com/v3/documents" \\
                      -H "Content-Type: multipart/form-data" \\
                      -H "Authorization: Bearer YOUR_API_KEY" \\
                      -F "file=@pg_essay_1.html;type=text/html" \\
                      -F 'metadata={}' \\
                      -F 'id=null'
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def create_document(
      file: Optional[UploadFile] = File(
          None,
          description="The file to ingest. Exactly one of file, raw_text, or chunks must be provided.",
      ),
      raw_text: Optional[str] = Form(
          None,
          description="Raw text content to ingest. Exactly one of file, raw_text, or chunks must be provided.",
      ),
      chunks: Optional[Json[list[str]]] = Form(
          None,
          description="Pre-processed text chunks to ingest. Exactly one of file, raw_text, or chunks must be provided.",
      ),
      id: Optional[UUID] = Form(
          None,
          description="The ID of the document. If not provided, a new ID will be generated.",
      ),
      collection_ids: Optional[Json[list[UUID]]] = Form(
          None,
          description="Collection IDs to associate with the document. If none are provided, the document will be assigned to the user's default collection.",
      ),
      metadata: Optional[Json[dict]] = Form(
          None,
          description="Metadata to associate with the document, such as title, description, or custom fields.",
      ),
      ingestion_mode: IngestionMode = Form(
          default=IngestionMode.custom,
          description=(
              "Ingestion modes:\n"
              "- `hi-res`: Thorough ingestion with full summaries and enrichment.\n"
              "- `fast`: Quick ingestion with minimal enrichment and no summaries.\n"
              "- `custom`: Full control via `ingestion_config`.\n\n"
              "If `filters` or `limit` (in `ingestion_config`) are provided alongside `hi-res` or `fast`, "
              "they will override the default settings for that mode."
          ),
      ),
      ingestion_config: Optional[Json[IngestionConfig]] = Form(
          None,
          description="An optional dictionary to override the default chunking configuration for the ingestion process. If not provided, the system will use the default server-side chunking configuration.",
      ),
      run_with_orchestration: Optional[bool] = Form(
          True,
          description="Whether or not ingestion runs with orchestration, default is `True`. When set to `False`, the ingestion process will run synchronous and directly return the result.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedIngestionResponse:
      """
      Creates a new Document object from an input file, text content, or chunks. The chosen `ingestion_mode` determines
      how the ingestion process is configured:

      **Ingestion Modes:**
      - `hi-res`: Comprehensive parsing and enrichment, including summaries and possibly more thorough parsing.
      - `fast`: Speed-focused ingestion that skips certain enrichment steps like summaries.
      - `custom`: Provide a full `ingestion_config` to customize the entire ingestion process.

      Exactly one of `file`, `raw_text`, or `chunks` must be provided. Documents are shared through `Collections` which allow for tightly specified cross-user interactions.

      When `run_with_orchestration` is true the ingestion is handed to the orchestration provider and the
      provider's response (with `document_id` added) is returned; otherwise ingestion runs inline and the
      response carries `task_id: null`.
      """
      # Per-user quotas are enforced for everyone except superusers.
      if not auth_user.is_superuser:
          user_document_count = (
              await self.services.management.documents_overview(
                  user_ids=[auth_user.id],
                  offset=0,
                  limit=1,
              )
          )["total_entries"]
          user_max_documents = (
              await self.services.management.get_user_max_documents(
                  auth_user.id
              )
          )
          if user_document_count >= user_max_documents:
              raise R2RException(
                  status_code=403,
                  message=f"User has reached the maximum number of documents allowed ({user_max_documents}).",
              )

          # Get chunks using the vector handler's list_chunks method
          user_chunk_count = (
              await self.services.ingestion.list_chunks(
                  filters={"owner_id": {"$eq": str(auth_user.id)}},
                  offset=0,
                  limit=1,
              )
          )["page_info"]["total_entries"]
          user_max_chunks = (
              await self.services.management.get_user_max_chunks(
                  auth_user.id
              )
          )
          if user_chunk_count >= user_max_chunks:
              raise R2RException(
                  status_code=403,
                  message=f"User has reached the maximum number of chunks allowed ({user_max_chunks}).",
              )

          user_collections_count = (
              await self.services.management.collections_overview(
                  user_ids=[auth_user.id],
                  offset=0,
                  limit=1,
              )
          )["total_entries"]
          user_max_collections = (
              await self.services.management.get_user_max_collections(
                  auth_user.id
              )
          )
          if user_collections_count >= user_max_collections:
              raise R2RException(
                  status_code=403,
                  message=f"User has reached the maximum number of collections allowed ({user_max_collections}).",
              )

      effective_ingestion_config = self._prepare_ingestion_config(
          ingestion_mode=ingestion_mode,
          ingestion_config=ingestion_config,
      )

      # Exactly one content source is allowed. Empty values ("" or [])
      # count as "not provided", so an empty chunk list is rejected here.
      provided_sources = sum(
          bool(source) for source in (file, raw_text, chunks)
      )
      if provided_sources == 0:
          raise R2RException(
              status_code=422,
              message="Either a `file`, `raw_text`, or `chunks` must be provided.",
          )
      if provided_sources > 1:
          raise R2RException(
              status_code=422,
              message="Only one of `file`, `raw_text`, or `chunks` may be provided.",
          )

      metadata = metadata or {}

      if chunks:
          if len(chunks) > MAX_CHUNKS_PER_REQUEST:
              raise R2RException(
                  f"Maximum of {MAX_CHUNKS_PER_REQUEST} chunks per request",
                  400,
              )

          # Honor a caller-supplied ID, exactly as the file and raw_text
          # paths do; otherwise derive one from the chunk contents.
          document_id = id or generate_document_id(
              json.dumps(chunks), auth_user.id
          )

          # FIXME: Metadata doesn't seem to be getting passed through
          raw_chunks_for_doc = [
              UnprocessedChunk(
                  text=chunk,
                  metadata=metadata,
                  id=generate_id(),
              )
              for chunk in chunks
          ]

          # Prepare workflow input
          workflow_input = {
              "document_id": str(document_id),
              "chunks": [
                  chunk.model_dump(mode="json")
                  for chunk in raw_chunks_for_doc
              ],
              "metadata": metadata,  # Base metadata for the document
              "user": auth_user.model_dump_json(),
              "ingestion_config": effective_ingestion_config.model_dump(
                  mode="json"
              ),
          }

          # TODO - Modify create_chunks so that we can add chunks to existing document
          if run_with_orchestration:
              # Run ingestion with orchestration
              raw_message = (
                  await self.providers.orchestration.run_workflow(
                      "ingest-chunks",
                      {"request": workflow_input},
                      options={
                          "additional_metadata": {
                              "document_id": str(document_id),
                          }
                      },
                  )
              )
              raw_message["document_id"] = str(document_id)
              return raw_message

          logger.info("Running chunk ingestion without orchestration.")
          # Imported here rather than at module level, as in the original.
          from core.main.orchestration import (
              simple_ingestion_factory,
          )

          simple_ingestor = simple_ingestion_factory(
              self.services.ingestion
          )
          await simple_ingestor["ingest-chunks"](workflow_input)
          return {  # type: ignore
              "message": "Document created and ingested successfully.",
              "document_id": str(document_id),
              "task_id": None,
          }

      if file:
          file_data = await self._process_file(file)
          # "content" is base64 text; report the size of the decoded
          # payload, not the (larger) length of its base64 encoding.
          decoded_content = base64.b64decode(file_data.pop("content"))
          content_length = len(decoded_content)
          file_content = BytesIO(decoded_content)
          document_id = id or generate_document_id(
              file_data["filename"], auth_user.id
          )
      elif raw_text:
          # Measure the UTF-8 bytes that are actually stored, not the
          # number of characters.
          encoded_text = raw_text.encode("utf-8")
          content_length = len(encoded_text)
          file_content = BytesIO(encoded_text)
          document_id = id or generate_document_id(
              raw_text, auth_user.id
          )
          file_data = {
              "filename": "N/A",
              "content_type": "text/plain",
          }
      else:
          # Defensive: the validation above should make this unreachable.
          raise R2RException(
              status_code=422,
              message="Either a file or content must be provided.",
          )

      workflow_input = {
          "file_data": file_data,
          "document_id": str(document_id),
          "collection_ids": (
              [str(cid) for cid in collection_ids]
              if collection_ids
              else None
          ),
          "metadata": metadata,
          "ingestion_config": effective_ingestion_config.model_dump(
              mode="json"
          ),
          "user": auth_user.model_dump_json(),
          "size_in_bytes": content_length,
      }

      file_name = file_data["filename"]
      # The raw content is stored before the ingestion workflow is started.
      await self.providers.database.files_handler.store_file(
          document_id,
          file_name,
          file_content,
          file_data["content_type"],
      )

      if run_with_orchestration:
          raw_message: dict[str, str | None] = await self.providers.orchestration.run_workflow(  # type: ignore
              "ingest-files",
              {"request": workflow_input},
              options={
                  "additional_metadata": {
                      "document_id": str(document_id),
                  }
              },
          )
          raw_message["document_id"] = str(document_id)
          return raw_message  # type: ignore

      logger.info(
          f"Running ingestion without orchestration for file {file_name} and document_id {document_id}."
      )
      # TODO - Clean up implementation logic here to be more explicitly `synchronous`
      from core.main.orchestration import simple_ingestion_factory

      simple_ingestor = simple_ingestion_factory(self.services.ingestion)
      await simple_ingestor["ingest-files"](workflow_input)
      return {  # type: ignore
          "message": "Document created and ingested successfully.",
          "document_id": str(document_id),
          "task_id": None,
      }
  @self.router.get(
      "/documents",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="List documents",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.list(
                          limit=10,
                          offset=0
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      function main() {
                          const response = await client.documents.list({
                              limit: 10,
                              offset: 0,
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "CLI",
                  "source": textwrap.dedent(
                      """
                      r2r documents list
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X GET "https://api.example.com/v3/documents" \\
                      -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def get_documents(
      ids: list[str] = Query(
          [],
          description="A list of document IDs to retrieve. If not provided, all documents will be returned.",
      ),
      offset: int = Query(
          0,
          ge=0,
          description="Specifies the number of objects to skip. Defaults to 0.",
      ),
      limit: int = Query(
          100,
          ge=1,
          le=1000,
          description="Specifies a limit on the number of objects to return, ranging between 1 and 1000. Defaults to 100.",
      ),
      include_summary_embeddings: bool = Query(
          False,
          description="Specifies whether or not to include embeddings of each document summary.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedDocumentsResponse:
      """
      Returns a paginated list of documents the authenticated user has access to.

      Results can be filtered by providing specific document IDs. Regular users will only see
      documents they own or have access to through collections. Superusers can see all documents.

      The documents are returned in order of last modification, with most recent first.
      """
      # Superusers are not restricted by owner or collection membership.
      requesting_user_id = (
          None if auth_user.is_superuser else [auth_user.id]
      )
      filter_collection_ids = (
          None if auth_user.is_superuser else auth_user.collection_ids
      )

      # IDs arrive as plain strings; reject malformed ones as a client
      # error instead of letting a bare ValueError escape.
      try:
          document_uuids = [UUID(document_id) for document_id in ids]
      except ValueError as exc:
          raise R2RException(
              status_code=422,
              message="Each value in `ids` must be a valid UUID.",
          ) from exc

      documents_overview_response = (
          await self.services.management.documents_overview(
              user_ids=requesting_user_id,
              collection_ids=filter_collection_ids,
              document_ids=document_uuids,
              offset=offset,
              limit=limit,
          )
      )

      # Summary embeddings are blanked out unless explicitly requested.
      if not include_summary_embeddings:
          for document in documents_overview_response["results"]:
              document.summary_embedding = None

      return (  # type: ignore
          documents_overview_response["results"],
          {
              "total_entries": documents_overview_response[
                  "total_entries"
              ]
          },
      )
  @self.router.get(
      "/documents/{id}",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="Retrieve a document",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.retrieve(
                          id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      function main() {
                          const response = await client.documents.retrieve({
                              id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "CLI",
                  "source": textwrap.dedent(
                      """
                      r2r documents retrieve b4ac4dd6-5f27-596e-a55b-7cf242ca30aa
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa" \\
                      -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def get_document(
      id: UUID = Path(
          ...,
          description="The ID of the document to retrieve.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedDocumentResponse:
      """
      Retrieves detailed information about a specific document by its ID.

      This endpoint returns the document's metadata, status, and system information. It does not
      return the document's content - use the `/documents/{id}/download` endpoint for that.

      Users can only retrieve documents they own or have access to through collections.
      Superusers can retrieve any document.
      """
      # Superusers bypass both the owner and the collection filters.
      if auth_user.is_superuser:
          owner_filter = None
          collection_filter = None
      else:
          owner_filter = [auth_user.id]
          collection_filter = auth_user.collection_ids

      # FIXME: This was using the pagination defaults from before... We need
      # to review if this is as intended.
      overview = await self.services.management.documents_overview(
          user_ids=owner_filter,
          collection_ids=collection_filter,
          document_ids=[id],
          offset=0,
          limit=100,
      )

      matches = overview["results"]
      if not matches:
          raise R2RException("Document not found.", 404)
      return matches[0]
  @self.router.get(
      "/documents/{id}/chunks",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="List document chunks",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.list_chunks(
                          id="32b6a70f-a995-5c51-85d2-834f06283a1e"
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      function main() {
                          const response = await client.documents.listChunks({
                              id: "32b6a70f-a995-5c51-85d2-834f06283a1e",
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "CLI",
                  "source": textwrap.dedent(
                      """
                      r2r documents list-chunks b4ac4dd6-5f27-596e-a55b-7cf242ca30aa
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/chunks" \\
                      -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def list_chunks(
      id: UUID = Path(
          ...,
          description="The ID of the document to retrieve chunks for.",
      ),
      offset: int = Query(
          0,
          ge=0,
          description="Specifies the number of objects to skip. Defaults to 0.",
      ),
      limit: int = Query(
          100,
          ge=1,
          le=1000,
          description="Specifies a limit on the number of objects to return, ranging between 1 and 1000. Defaults to 100.",
      ),
      include_vectors: Optional[bool] = Query(
          False,
          description="Whether to include vector embeddings in the response.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedChunksResponse:
      """
      Retrieves the text chunks that were generated from a document during ingestion.
      Chunks represent semantic sections of the document and are used for retrieval
      and analysis.

      Users can only access chunks from documents they own or have access to through
      collections. Vector embeddings are only included if specifically requested.

      Results are returned in chunk sequence order, representing their position in
      the original document.
      """
      list_document_chunks = (
          await self.services.management.list_document_chunks(
              id, offset, limit, include_vectors
          )
      )

      if not list_document_chunks["results"]:
          raise R2RException(
              "No chunks found for the given document ID.", 404
          )

      # Superusers and the owner are authorised outright. The collections
      # lookup is only needed (and only performed) for everyone else.
      if not auth_user.is_superuser:
          # Ownership is read from the first returned chunk.
          chunk_owner_id = list_document_chunks["results"][0].get(
              "owner_id"
          )
          if str(chunk_owner_id) != str(auth_user.id):
              document_collections = (
                  await self.services.management.collections_overview(
                      offset=0,
                      limit=-1,
                      document_ids=[id],
                  )
              )
              shared_collection_ids = set(
                  auth_user.collection_ids
              ).intersection(
                  {ele.id for ele in document_collections["results"]}
              )
              if not shared_collection_ids:
                  raise R2RException(
                      "Not authorized to access this document's chunks.",
                      403,
                  )

      return (  # type: ignore
          list_document_chunks["results"],
          {"total_entries": list_document_chunks["total_entries"]},
      )
  @self.router.get(
      "/documents/{id}/download",
      dependencies=[Depends(self.rate_limit_dependency)],
      response_class=StreamingResponse,
      summary="Download document content",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.download(
                          id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      async function main() {
                          const response = await client.documents.download({
                              id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/download" \\
                          -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def get_document_file(
      id: str = Path(..., description="Document ID"),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> StreamingResponse:
      """
      Downloads the original file content of a document.

      For uploaded files, returns the original file with its proper MIME type.
      For text-only documents, returns the content as plain text.

      Users can only download documents they own or have access to through collections.
      """
      # Function-scope import: only this endpoint needs percent-encoding.
      from urllib.parse import quote

      try:
          document_uuid = UUID(id)
      except ValueError:
          # `from None`: the ValueError adds nothing to the 422 we report.
          raise R2RException(
              status_code=422, message="Invalid document ID format."
          ) from None

      # Retrieve the document's information
      documents_overview_response = (
          await self.services.management.documents_overview(
              user_ids=None,
              collection_ids=None,
              document_ids=[document_uuid],
              offset=0,
              limit=1,
          )
      )
      if not documents_overview_response["results"]:
          raise R2RException("Document not found.", 404)
      document = documents_overview_response["results"][0]

      is_owner = str(document.owner_id) == str(auth_user.id)
      if not auth_user.is_superuser and not is_owner:
          # Non-owners may still reach the document through a shared collection.
          document_collections = (
              await self.services.management.collections_overview(
                  offset=0,
                  limit=-1,
                  document_ids=[document_uuid],
              )
          )
          document_collection_ids = {
              str(ele.id) for ele in document_collections["results"]
          }
          user_collection_ids = {
              str(cid) for cid in auth_user.collection_ids
          }
          if user_collection_ids.isdisjoint(document_collection_ids):
              raise R2RException(
                  "Not authorized to access this document.", 403
              )

      file_tuple = await self.services.management.download_file(
          document_uuid
      )
      if not file_tuple:
          raise R2RException(status_code=404, message="File not found.")
      file_name, file_content, file_size = file_tuple

      mime_type, _ = mimetypes.guess_type(file_name)
      if not mime_type:
          mime_type = "application/octet-stream"

      async def file_stream():
          chunk_size = 1024 * 1024  # 1MB
          while data := file_content.read(chunk_size):
              yield data

      # A raw filename is unsafe inside a quoted header parameter: quotes and
      # backslashes break the quoting, and control or non-ASCII characters are
      # not valid there. Send a sanitised ASCII fallback, plus the exact name
      # in the RFC 5987 `filename*` form whenever the fallback had to differ.
      # Plain ASCII filenames produce exactly the same header as before.
      ascii_name = "".join(
          ch if " " <= ch <= "~" and ch not in '"\\' else "_"
          for ch in file_name
      )
      content_disposition = f'inline; filename="{ascii_name}"'
      if ascii_name != file_name:
          content_disposition += (
              f"; filename*=UTF-8''{quote(file_name, safe='')}"
          )

      return StreamingResponse(
          file_stream(),
          media_type=mime_type,
          headers={
              "Content-Disposition": content_disposition,
              "Content-Length": str(file_size),
          },
      )
  @self.router.delete(
      "/documents/by-filter",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="Delete documents by filter",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.delete_by_filter(
                          filters={"document_type": {"$eq": "txt"}}
                      )
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X DELETE "https://api.example.com/v3/documents/by-filter?filters=%7B%22document_type%22%3A%7B%22%24eq%22%3A%22text%22%7D%2C%22created_at%22%3A%7B%22%24lt%22%3A%222023-01-01T00%3A00%3A00Z%22%7D%7D" \\
                          -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def delete_document_by_filter(
      filters: Json[dict] = Body(
          ..., description="JSON-encoded filters"
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedBooleanResponse:
      """
      Delete documents based on provided filters. Allowed operators include `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`, `ilike`, `in`, and `nin`. Deletion requests are limited to a user's own documents.
      """
      # Always AND the caller's filters with an ownership clause so the
      # request can only ever match documents owned by the caller.
      ownership_clause = {"owner_id": {"$eq": str(auth_user.id)}}
      scoped_filters = {"$and": [ownership_clause, filters]}

      management = self.services.management
      await management.delete_documents_and_chunks_by_filter(
          filters=scoped_filters
      )
      return GenericBooleanResponse(success=True)  # type: ignore
  @self.router.delete(
      "/documents/{id}",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="Delete a document",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.delete(
                          id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      async function main() {
                          const response = await client.documents.delete({
                              id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "CLI",
                  "source": textwrap.dedent(
                      """
                      r2r documents delete b4ac4dd6-5f27-596e-a55b-7cf242ca30aa
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X DELETE "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa" \\
                          -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def delete_document_by_id(
      id: UUID = Path(..., description="Document ID"),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedBooleanResponse:
      """
      Delete a specific document. All chunks corresponding to the document are deleted, and all other references to the document are removed.

      NOTE - Deletions do not yet impact the knowledge graph or other derived data. This feature is planned for a future release.
      """
      filters = {"document_id": {"$eq": str(id)}}
      if not auth_user.is_superuser:
          # Regular users may only delete their own documents, so the
          # document filter is combined with an ownership clause.
          filters = {
              "$and": [{"owner_id": {"$eq": str(auth_user.id)}}, filters]
          }

      await self.services.management.delete_documents_and_chunks_by_filter(
          filters=filters
      )
      return GenericBooleanResponse(success=True)  # type: ignore
  @self.router.get(
      "/documents/{id}/collections",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="List document collections",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.list_collections(
                          id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa", offset=0, limit=10
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      async function main() {
                          const response = await client.documents.listCollections({
                              id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "CLI",
                  "source": textwrap.dedent(
                      """
                      r2r documents list-collections b4ac4dd6-5f27-596e-a55b-7cf242ca30aa
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/collections" \\
                          -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def get_document_collections(
      id: str = Path(..., description="Document ID"),
      offset: int = Query(
          0,
          ge=0,
          description="Specifies the number of objects to skip. Defaults to 0.",
      ),
      limit: int = Query(
          100,
          ge=1,
          le=1000,
          description="Specifies a limit on the number of objects to return, ranging between 1 and 1000. Defaults to 100.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedCollectionsResponse:
      """
      Retrieves all collections that contain the specified document. This endpoint is restricted
      to superusers only and provides a system-wide view of document organization.

      Collections are used to organize documents and manage access control. A document can belong
      to multiple collections, and users can access documents through collection membership.

      The results are paginated and ordered by collection creation date, with the most recently
      created collections appearing first.

      NOTE - This endpoint is only available to superusers, it will be extended to regular users in a future release.
      """
      if not auth_user.is_superuser:
          raise R2RException(
              "Only a superuser can get the collections belonging to a document.",
              403,
          )

      # `id` arrives as a plain string; reject malformed values with a 422
      # (as the download endpoint does) instead of letting ValueError escape.
      try:
          document_uuid = UUID(id)
      except ValueError:
          raise R2RException(
              status_code=422, message="Invalid document ID format."
          ) from None

      collections_response = (
          await self.services.management.collections_overview(
              offset=offset,
              limit=limit,
              document_ids=[document_uuid],
          )
      )
      return collections_response["results"], {  # type: ignore
          "total_entries": collections_response["total_entries"]
      }
  @self.router.post(
      "/documents/{id}/extract",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="Extract entities and relationships",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.extract(
                          id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                      )
                      """
                  ),
              },
          ],
      },
  )
  @self.base_endpoint
  async def extract(
      id: UUID = Path(
          ...,
          description="The ID of the document to extract entities and relationships from.",
      ),
      run_type: KGRunType = Body(
          default=KGRunType.RUN,
          description="Whether to return an estimate of the creation cost or to actually extract the document.",
      ),
      settings: Optional[KGCreationSettings] = Body(
          default=None,
          description="Settings for the entities and relationships extraction process.",
      ),
      run_with_orchestration: Optional[bool] = Body(
          default=True,
          description="Whether to run the entities and relationships extraction process with orchestration.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedGenericMessageResponse:
      """
      Extracts entities and relationships from a document.

      The entities and relationships extraction process involves:
      1. Parsing documents into semantic chunks
      2. Extracting entities and relationships using LLMs
      3. Storing the created entities and relationships in the knowledge graph
      4. Preserving the document's metadata and content, and associating the elements with collections the document belongs to
      """
      # Reject unauthorized callers before doing any other work.
      if not auth_user.is_superuser:
          raise R2RException(
              "Only a superuser can extract entities and relationships from a document.",
              403,
          )

      settings_dict = settings.dict() if settings else None  # type: ignore

      # If no run type is provided, default to estimate
      if not run_type:
          run_type = KGRunType.ESTIMATE

      # Apply runtime settings overrides
      server_graph_creation_settings = (
          self.providers.database.config.graph_creation_settings
      )
      if settings_dict:
          server_graph_creation_settings = update_settings_from_dict(
              server_settings=server_graph_creation_settings,
              settings_dict=settings_dict,  # type: ignore
          )

      if run_type is KGRunType.ESTIMATE:
          estimate = await self.services.graph.get_creation_estimate(
              document_id=id,
              graph_creation_settings=server_graph_creation_settings,
          )
          return {  # type: ignore
              "message": "Estimate retrieved successfully",
              "task_id": None,
              "id": id,
              "estimate": estimate,
          }

      # Built before branching: BOTH execution paths below consume it.
      # It used to be assigned only inside the orchestration branch, so the
      # non-orchestrated path failed on an unbound local variable.
      workflow_input = {
          "document_id": str(id),
          "graph_creation_settings": server_graph_creation_settings.model_dump_json(),
          "user": auth_user.json(),
      }

      if run_with_orchestration:
          return await self.providers.orchestration.run_workflow(
              "extract-triples", {"request": workflow_input}, {}
          )

      # Imported here, as in the original, rather than at module level.
      from core.main.orchestration import simple_kg_factory

      logger.info("Running extract-triples without orchestration.")
      simple_kg = simple_kg_factory(self.services.graph)
      await simple_kg["extract-triples"](workflow_input)
      return {  # type: ignore
          "message": "Graph created successfully.",
          "task_id": None,
      }
  @self.router.get(
      "/documents/{id}/entities",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="Lists the entities from the document",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.list_entities(
                          id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                      )
                      """
                  ),
              },
          ],
      },
  )
  @self.base_endpoint
  async def get_entities(
      id: UUID = Path(
          ...,
          description="The ID of the document to retrieve entities from.",
      ),
      offset: int = Query(
          0,
          ge=0,
          description="Specifies the number of objects to skip. Defaults to 0.",
      ),
      limit: int = Query(
          100,
          ge=1,
          le=1000,
          description="Specifies a limit on the number of objects to return, ranging between 1 and 1000. Defaults to 100.",
      ),
      include_embeddings: Optional[bool] = Query(
          False,
          description="Whether to include vector embeddings in the response.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedEntitiesResponse:
      """
      Retrieves the entities that were extracted from a document. These represent
      important semantic elements like people, places, organizations, concepts, etc.

      Users can only access entities from documents they own or have access to through
      collections. Entity embeddings are only included if specifically requested.

      Results are returned in the order they were extracted from the document.
      """
      # First check if the document exists and user has access.
      # Superusers are not restricted; everyone else is scoped to their own
      # user id and collection ids.
      # NOTE(review): how `user_ids` and `collection_ids` are combined is
      # decided inside `documents_overview` - confirm it matches the
      # "own OR via collections" contract documented above.
      is_superuser = auth_user.is_superuser
      documents_overview_response = (
          await self.services.management.documents_overview(
              user_ids=None if is_superuser else [auth_user.id],
              collection_ids=(
                  None if is_superuser else auth_user.collection_ids
              ),
              document_ids=[id],
              offset=0,
              limit=1,
          )
      )
      if not documents_overview_response["results"]:
          raise R2RException("Document not found.", 404)

      # Get all entities for this document from the document_entity table
      entities, count = (
          await self.providers.database.graphs_handler.entities.get(
              parent_id=id,
              store_type="documents",
              offset=offset,
              limit=limit,
              include_embeddings=include_embeddings,
          )
      )
      return entities, {"total_entries": count}  # type: ignore
  @self.router.get(
      "/documents/{id}/relationships",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="List document relationships",
      openapi_extra={
          "x-codeSamples": [
              {
                  "lang": "Python",
                  "source": textwrap.dedent(
                      """
                      from r2r import R2RClient
                      client = R2RClient("http://localhost:7272")
                      # when using auth, do client.login(...)
                      response = client.documents.list_relationships(
                          id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                          offset=0,
                          limit=100
                      )
                      """
                  ),
              },
              {
                  "lang": "JavaScript",
                  "source": textwrap.dedent(
                      """
                      const { r2rClient } = require("r2r-js");
                      const client = new r2rClient("http://localhost:7272");
                      async function main() {
                          const response = await client.documents.listRelationships({
                              id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                              offset: 0,
                              limit: 100,
                          });
                      }
                      main();
                      """
                  ),
              },
              {
                  "lang": "CLI",
                  "source": textwrap.dedent(
                      """
                      r2r documents list-relationships b4ac4dd6-5f27-596e-a55b-7cf242ca30aa
                      """
                  ),
              },
              {
                  "lang": "cURL",
                  "source": textwrap.dedent(
                      """
                      curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/relationships" \\
                          -H "Authorization: Bearer YOUR_API_KEY"
                      """
                  ),
              },
          ]
      },
  )
  @self.base_endpoint
  async def get_relationships(
      id: UUID = Path(
          ...,
          description="The ID of the document to retrieve relationships for.",
      ),
      offset: int = Query(
          0,
          ge=0,
          description="Specifies the number of objects to skip. Defaults to 0.",
      ),
      limit: int = Query(
          100,
          ge=1,
          le=1000,
          description="Specifies a limit on the number of objects to return, ranging between 1 and 1000. Defaults to 100.",
      ),
      entity_names: Optional[list[str]] = Query(
          None,
          description="Filter relationships by specific entity names.",
      ),
      relationship_types: Optional[list[str]] = Query(
          None,
          description="Filter relationships by specific relationship types.",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ) -> WrappedRelationshipsResponse:
      """
      Retrieves the relationships between entities that were extracted from a document. These represent
      connections and interactions between entities found in the text.

      Users can only access relationships from documents they own or have access to through
      collections. Results can be filtered by entity names and relationship types.

      Results are returned in the order they were extracted from the document.
      """
      # First check if the document exists and user has access.
      # Superusers are not restricted; everyone else is scoped to their own
      # user id and collection ids.
      # NOTE(review): how `user_ids` and `collection_ids` are combined is
      # decided inside `documents_overview` - confirm it matches the
      # "own OR via collections" contract documented above.
      is_superuser = auth_user.is_superuser
      documents_overview_response = (
          await self.services.management.documents_overview(
              user_ids=None if is_superuser else [auth_user.id],
              collection_ids=(
                  None if is_superuser else auth_user.collection_ids
              ),
              document_ids=[id],
              offset=0,
              limit=1,
          )
      )
      if not documents_overview_response["results"]:
          raise R2RException("Document not found.", 404)

      # Get relationships for this document
      relationships, count = (
          await self.providers.database.graphs_handler.relationships.get(
              parent_id=id,
              store_type="documents",
              entity_names=entity_names,
              relationship_types=relationship_types,
              offset=offset,
              limit=limit,
          )
      )
      return relationships, {"total_entries": count}  # type: ignore
  @self.router.post(
      "/documents/search",
      dependencies=[Depends(self.rate_limit_dependency)],
      summary="Search document summaries",
  )
  @self.base_endpoint
  async def search_documents(
      query: str = Body(
          ...,
          description="The search query to perform.",
      ),
      search_mode: SearchMode = Body(
          default=SearchMode.custom,
          description=(
              "Default value of `custom` allows full control over search settings.\n\n"
              "Pre-configured search modes:\n"
              "`basic`: A simple semantic-based search.\n"
              "`advanced`: A more powerful hybrid search combining semantic and full-text.\n"
              "`custom`: Full control via `search_settings`.\n\n"
              "If `filters` or `limit` are provided alongside `basic` or `advanced`, "
              "they will override the default settings for that mode."
          ),
      ),
      search_settings: SearchSettings = Body(
          default_factory=SearchSettings,
          description="Settings for document search",
      ),
      auth_user=Depends(self.providers.auth.auth_wrapper()),
  ):  # -> WrappedDocumentSearchResponse:  # type: ignore
      """
      Perform a search query on the automatically generated document summaries in the system.

      This endpoint allows for complex filtering of search results using PostgreSQL-based queries.
      Filters can be applied to various fields such as document_id, and internal metadata values.

      Allowed operators include `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`, `ilike`, `in`, and `nin`.
      """
      # Resolve the settings actually used for this caller and search mode.
      resolved_settings = self._prepare_search_settings(
          auth_user, search_mode, search_settings
      )

      # Embed the query once and pass the vector along with the raw text.
      embedding_provider = self.providers.embedding
      embedded_query = await embedding_provider.async_get_embedding(query)

      return await self.services.retrieval.search_documents(
          query=query,
          query_embedding=embedded_query,
          settings=resolved_settings,
      )
  @staticmethod
  async def _process_file(file):
      """Read an upload-like object and return it as a serializable dict.

      `file` must expose an awaitable `read()` plus `filename` and
      `content_type` attributes. The returned dict has the keys
      `filename`, `content` (the bytes, base64-encoded as text) and
      `content_type`.
      """
      import base64

      raw_bytes = await file.read()
      encoded_text = base64.b64encode(raw_bytes).decode("utf-8")
      payload = {
          "filename": file.filename,
          "content": encoded_text,
          "content_type": file.content_type,
      }
      return payload