# documents_router.py
  1. import base64
  2. import logging
  3. import mimetypes
  4. import textwrap
  5. from datetime import datetime
  6. from io import BytesIO
  7. from typing import Any, Optional
  8. from urllib.parse import quote
  9. from uuid import UUID
  10. from fastapi import Body, Depends, File, Form, Path, Query, UploadFile
  11. from fastapi.background import BackgroundTasks
  12. from fastapi.responses import FileResponse, StreamingResponse
  13. from pydantic import Json
  14. from core.base import (
  15. IngestionConfig,
  16. R2RException,
  17. SearchMode,
  18. SearchSettings,
  19. UnprocessedChunk,
  20. Workflow,
  21. generate_document_id,
  22. generate_id,
  23. select_search_filters,
  24. )
  25. from core.base.abstractions import GraphCreationSettings, StoreType
  26. from core.base.api.models import (
  27. GenericBooleanResponse,
  28. WrappedBooleanResponse,
  29. WrappedChunksResponse,
  30. WrappedCollectionsResponse,
  31. WrappedDocumentResponse,
  32. WrappedDocumentSearchResponse,
  33. WrappedDocumentsResponse,
  34. WrappedEntitiesResponse,
  35. WrappedGenericMessageResponse,
  36. WrappedIngestionResponse,
  37. WrappedRelationshipsResponse,
  38. )
  39. from core.utils import update_settings_from_dict
  40. from shared.abstractions import IngestionMode
  41. from ...abstractions import R2RProviders, R2RServices
  42. from ...config import R2RConfig
  43. from .base_router import BaseRouterV3
logger = logging.getLogger()

# Hard cap on the number of pre-processed chunks accepted in a single
# document-creation request (guards against oversized payloads).
MAX_CHUNKS_PER_REQUEST = 1024 * 100
  46. def merge_search_settings(
  47. base: SearchSettings, overrides: SearchSettings
  48. ) -> SearchSettings:
  49. # Convert both to dict
  50. base_dict = base.model_dump()
  51. overrides_dict = overrides.model_dump(exclude_unset=True)
  52. # Update base_dict with values from overrides_dict
  53. # This ensures that any field set in overrides takes precedence
  54. for k, v in overrides_dict.items():
  55. base_dict[k] = v
  56. # Construct a new SearchSettings from the merged dict
  57. return SearchSettings(**base_dict)
  58. def merge_ingestion_config(
  59. base: IngestionConfig, overrides: IngestionConfig
  60. ) -> IngestionConfig:
  61. base_dict = base.model_dump()
  62. overrides_dict = overrides.model_dump(exclude_unset=True)
  63. for k, v in overrides_dict.items():
  64. base_dict[k] = v
  65. return IngestionConfig(**base_dict)
  66. class DocumentsRouter(BaseRouterV3):
  67. def __init__(
  68. self,
  69. providers: R2RProviders,
  70. services: R2RServices,
  71. config: R2RConfig,
  72. ):
  73. logging.info("Initializing DocumentsRouter")
  74. super().__init__(providers, services, config)
  75. self._register_workflows()
  76. def _prepare_search_settings(
  77. self,
  78. auth_user: Any,
  79. search_mode: SearchMode,
  80. search_settings: Optional[SearchSettings],
  81. ) -> SearchSettings:
  82. """Prepare the effective search settings based on the provided
  83. search_mode, optional user-overrides in search_settings, and applied
  84. filters."""
  85. if search_mode != SearchMode.custom:
  86. # Start from mode defaults
  87. effective_settings = SearchSettings.get_default(search_mode.value)
  88. if search_settings:
  89. # Merge user-provided overrides
  90. effective_settings = merge_search_settings(
  91. effective_settings, search_settings
  92. )
  93. else:
  94. # Custom mode: use provided settings or defaults
  95. effective_settings = search_settings or SearchSettings()
  96. # Apply user-specific filters
  97. effective_settings.filters = select_search_filters(
  98. auth_user, effective_settings
  99. )
  100. return effective_settings
  101. # TODO - Remove this legacy method
  102. def _register_workflows(self):
  103. print(self.providers.orchestration.config.provider)
  104. self.providers.orchestration.register_workflows(
  105. Workflow.INGESTION,
  106. self.services.ingestion,
  107. {
  108. "ingest-files": (
  109. #"Ingest files task queued successfully."
  110. "Document created and ingested successfully."
  111. if self.providers.orchestration.config.provider != "simple"
  112. else "Document created and ingested successfully."
  113. ),
  114. "ingest-chunks": (
  115. #"Ingest chunks task queued successfully."
  116. "Document created and ingested successfully."
  117. if self.providers.orchestration.config.provider != "simple"
  118. else "Document created and ingested successfully."
  119. ),
  120. "update-chunk": (
  121. #"Update chunk task queued successfully."
  122. "Chunk update completed successfully."
  123. if self.providers.orchestration.config.provider != "simple"
  124. else "Chunk update completed successfully."
  125. ),
  126. "update-document-metadata": (
  127. #"Update document metadata task queued successfully."
  128. "Document metadata update completed successfully."
  129. if self.providers.orchestration.config.provider != "simple"
  130. else "Document metadata update completed successfully."
  131. ),
  132. "create-vector-index": (
  133. #"Vector index creation task queued successfully."
  134. "Vector index creation task completed successfully."
  135. if self.providers.orchestration.config.provider != "simple"
  136. else "Vector index creation task completed successfully."
  137. ),
  138. "delete-vector-index": (
  139. #"Vector index deletion task queued successfully."
  140. "Vector index deletion task completed successfully."
  141. if self.providers.orchestration.config.provider != "simple"
  142. else "Vector index deletion task completed successfully."
  143. ),
  144. "select-vector-index": (
  145. #"Vector index selection task queued successfully."
  146. "Vector index selection task completed successfully."
  147. if self.providers.orchestration.config.provider != "simple"
  148. else "Vector index selection task completed successfully."
  149. ),
  150. },
  151. )
  152. def _prepare_ingestion_config(
  153. self,
  154. ingestion_mode: IngestionMode,
  155. ingestion_config: Optional[IngestionConfig],
  156. ) -> IngestionConfig:
  157. # If not custom, start from defaults
  158. if ingestion_mode != IngestionMode.custom:
  159. effective_config = IngestionConfig.get_default(
  160. ingestion_mode.value, app=self.providers.auth.config.app
  161. )
  162. if ingestion_config:
  163. effective_config = merge_ingestion_config(
  164. effective_config, ingestion_config
  165. )
  166. else:
  167. effective_config = ingestion_config or IngestionConfig(
  168. app=self.providers.auth.config.app
  169. )
  170. effective_config.validate_config()
  171. return effective_config
  172. def _setup_routes(self):
  173. @self.router.post(
  174. "/documents",
  175. dependencies=[Depends(self.rate_limit_dependency)],
  176. status_code=202,
  177. summary="Create a new document",
  178. openapi_extra={
  179. "x-codeSamples": [
  180. {
  181. "lang": "Python",
  182. "source": textwrap.dedent("""
  183. from r2r import R2RClient
  184. client = R2RClient()
  185. # when using auth, do client.login(...)
  186. response = client.documents.create(
  187. file_path="pg_essay_1.html",
  188. metadata={"metadata_1":"some random metadata"},
  189. id=None
  190. )
  191. """),
  192. },
  193. {
  194. "lang": "JavaScript",
  195. "source": textwrap.dedent("""
  196. const { r2rClient } = require("r2r-js");
  197. const client = new r2rClient();
  198. function main() {
  199. const response = await client.documents.create({
  200. file: { path: "examples/data/marmeladov.txt", name: "marmeladov.txt" },
  201. metadata: { title: "marmeladov.txt" },
  202. });
  203. }
  204. main();
  205. """),
  206. },
  207. {
  208. "lang": "cURL",
  209. "source": textwrap.dedent("""
  210. curl -X POST "https://api.example.com/v3/documents" \\
  211. -H "Content-Type: multipart/form-data" \\
  212. -H "Authorization: Bearer YOUR_API_KEY" \\
  213. -F "file=@pg_essay_1.html;type=text/html" \\
  214. -F 'metadata={}' \\
  215. -F 'id=null'
  216. """),
  217. },
  218. ]
  219. },
  220. )
  221. @self.base_endpoint
        async def create_document(
            file: Optional[UploadFile] = File(
                None,
                description="The file to ingest. Exactly one of file, raw_text, or chunks must be provided.",
            ),
            raw_text: Optional[str] = Form(
                None,
                description="Raw text content to ingest. Exactly one of file, raw_text, or chunks must be provided.",
            ),
            chunks: Optional[Json[list[str]]] = Form(
                None,
                description="Pre-processed text chunks to ingest. Exactly one of file, raw_text, or chunks must be provided.",
            ),
            id: Optional[UUID] = Form(
                None,
                description="The ID of the document. If not provided, a new ID will be generated.",
            ),
            collection_ids: Optional[Json[list[UUID]]] = Form(
                None,
                description="Collection IDs to associate with the document. If none are provided, the document will be assigned to the user's default collection.",
            ),
            metadata: Optional[Json[dict]] = Form(
                None,
                description="Metadata to associate with the document, such as title, description, or custom fields.",
            ),
            ingestion_mode: IngestionMode = Form(
                default=IngestionMode.custom,
                description=(
                    "Ingestion modes:\n"
                    "- `hi-res`: Thorough ingestion with full summaries and enrichment.\n"
                    "- `ocr`: OCR via Mistral and full summaries.\n"
                    "- `fast`: Quick ingestion with minimal enrichment and no summaries.\n"
                    "- `custom`: Full control via `ingestion_config`.\n\n"
                    "If `filters` or `limit` (in `ingestion_config`) are provided alongside `hi-res` or `fast`, "
                    "they will override the default settings for that mode."
                ),
            ),
            ingestion_config: Optional[Json[IngestionConfig]] = Form(
                None,
                description="An optional dictionary to override the default chunking configuration for the ingestion process. If not provided, the system will use the default server-side chunking configuration.",
            ),
            run_with_orchestration: Optional[bool] = Form(
                True,
                description="Whether or not ingestion runs with orchestration, default is `True`. When set to `False`, the ingestion process will run synchronous and directly return the result.",
            ),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> WrappedIngestionResponse:
            """
            Creates a new Document object from an input file, text content, or chunks. The chosen `ingestion_mode` determines
            how the ingestion process is configured:

            **Ingestion Modes:**
            - `hi-res`: Comprehensive parsing and enrichment, including summaries and possibly more thorough parsing.
            - `fast`: Speed-focused ingestion that skips certain enrichment steps like summaries.
            - `custom`: Provide a full `ingestion_config` to customize the entire ingestion process.

            Either a file or text content must be provided, but not both. Documents are shared through `Collections` which allow for tightly specified cross-user interactions.

            The ingestion process runs asynchronously and its progress can be tracked using the returned
            task_id.
            """
            if not auth_user.is_superuser:
                # Enforce per-user quotas (documents, chunks, collections)
                # up-front, before any ingestion work begins.
                user_document_count = (
                    await self.services.management.documents_overview(
                        user_ids=[auth_user.id],
                        offset=0,
                        limit=1,
                    )
                )["total_entries"]
                user_max_documents = (
                    await self.services.management.get_user_max_documents(
                        auth_user.id
                    )
                )
                if user_document_count >= user_max_documents:
                    raise R2RException(
                        status_code=403,
                        message=f"User has reached the maximum number of documents allowed ({user_max_documents}).",
                    )
                # Get chunks using the vector handler's list_chunks method
                # NOTE(review): the real chunk-count lookup is disabled (see
                # the inert string literal below), so `user_chunk_count` is
                # always 0 and the chunk-quota check below can never trip.
                user_chunk_count = 0
                '''
                (
                    await self.services.ingestion.list_chunks(
                        filters={"owner_id": {"$eq": str(auth_user.id)}},
                        offset=0,
                        limit=1,
                    )
                )["total_entries"]
                '''
                user_max_chunks = (
                    await self.services.management.get_user_max_chunks(
                        auth_user.id
                    )
                )
                if user_chunk_count >= user_max_chunks:
                    raise R2RException(
                        status_code=403,
                        message=f"User has reached the maximum number of chunks allowed ({user_max_chunks}).",
                    )
                user_collections_count = (
                    await self.services.management.collections_overview(
                        user_ids=[auth_user.id],
                        offset=0,
                        limit=1,
                    )
                )["total_entries"]
                user_max_collections = (
                    await self.services.management.get_user_max_collections(
                        auth_user.id
                    )
                )
                if user_collections_count >= user_max_collections:  # type: ignore
                    raise R2RException(
                        status_code=403,
                        message=f"User has reached the maximum number of collections allowed ({user_max_collections}).",
                    )
            # Resolve mode defaults + caller overrides into one config.
            effective_ingestion_config = self._prepare_ingestion_config(
                ingestion_mode=ingestion_mode,
                ingestion_config=ingestion_config,
            )
            # Exactly one of file / raw_text / chunks must be supplied.
            if not file and not raw_text and not chunks:
                raise R2RException(
                    status_code=422,
                    message="Either a `file`, `raw_text`, or `chunks` must be provided.",
                )
            if (
                (file and raw_text)
                or (file and chunks)
                or (raw_text and chunks)
            ):
                raise R2RException(
                    status_code=422,
                    message="Only one of `file`, `raw_text`, or `chunks` may be provided.",
                )
            # Check if the user is a superuser
            metadata = metadata or {}
            if chunks:
                # --- Path 1: caller supplied pre-processed chunks. ---
                if len(chunks) == 0:
                    raise R2RException("Empty list of chunks provided", 400)
                if len(chunks) > MAX_CHUNKS_PER_REQUEST:
                    raise R2RException(
                        f"Maximum of {MAX_CHUNKS_PER_REQUEST} chunks per request",
                        400,
                    )
                # Deterministic document id derived from content + owner
                # when the caller did not provide one.
                document_id = id or generate_document_id(
                    "".join(chunks), auth_user.id
                )
                # FIXME: Metadata doesn't seem to be getting passed through
                raw_chunks_for_doc = [
                    UnprocessedChunk(
                        text=chunk,
                        metadata=metadata,
                        id=generate_id(),
                    )
                    for chunk in chunks
                ]
                # Prepare workflow input
                workflow_input = {
                    "document_id": str(document_id),
                    "chunks": [
                        chunk.model_dump(mode="json")
                        for chunk in raw_chunks_for_doc
                    ],
                    "metadata": metadata,  # Base metadata for the document
                    "user": auth_user.model_dump_json(),
                    "ingestion_config": effective_ingestion_config.model_dump(
                        mode="json"
                    ),
                }
                if run_with_orchestration:
                    try:
                        # Run ingestion with orchestration
                        raw_message = (
                            await self.providers.orchestration.run_workflow(
                                "ingest-chunks",
                                {"request": workflow_input},
                                options={
                                    "additional_metadata": {
                                        "document_id": str(document_id),
                                    }
                                },
                            )
                        )
                        raw_message["document_id"] = str(document_id)
                        return raw_message  # type: ignore
                    except Exception as e:  # TODO: Need to find specific errors that we should be excepting (gRPC most likely?)
                        # Orchestration failure is non-fatal: log and fall
                        # through to the synchronous path below.
                        logger.error(
                            f"Error running orchestrated ingestion: {e} \n\nAttempting to run without orchestration."
                        )
                # Synchronous fallback (orchestration disabled or failed).
                logger.info("Running chunk ingestion without orchestration.")
                from core.main.orchestration import simple_ingestion_factory

                simple_ingestor = simple_ingestion_factory(
                    self.services.ingestion
                )
                await simple_ingestor["ingest-chunks"](workflow_input)
                return {  # type: ignore
                    "message": "Document created and ingested successfully.",
                    "document_id": str(document_id),
                    "task_id": None,
                }
            else:
                # --- Path 2: caller supplied a file or raw text. ---
                if file:
                    file_data = await self._process_file(file)
                    if not file.filename:
                        raise R2RException(
                            status_code=422,
                            message="Uploaded file must have a filename.",
                        )
                    file_ext = file.filename.split(".")[
                        -1
                    ]  # e.g. "pdf", "txt"
                    # Per-user, per-extension upload size limit.
                    max_allowed_size = await self.services.management.get_max_upload_size_by_type(
                        user_id=auth_user.id, file_type_or_ext=file_ext
                    )
                    content_length = file_data["content_length"]
                    if content_length > max_allowed_size:
                        raise R2RException(
                            status_code=413,  # HTTP 413: Payload Too Large
                            message=(
                                f"File size exceeds maximum of {max_allowed_size} bytes "
                                f"for extension '{file_ext}'."
                            ),
                        )
                    # Decode the base64 payload once, then drop the encoded
                    # copy so it is not carried through the workflow input.
                    file_content = BytesIO(
                        base64.b64decode(file_data["content"])
                    )
                    file_data.pop("content", None)
                    document_id = id or generate_document_id(
                        file_data["filename"], auth_user.id
                    )
                elif raw_text:
                    # NOTE(review): this counts characters, not encoded
                    # bytes; for non-ASCII text the stored size may differ
                    # from the UTF-8 byte length — confirm intent.
                    content_length = len(raw_text)
                    file_content = BytesIO(raw_text.encode("utf-8"))
                    document_id = id or generate_document_id(
                        raw_text, auth_user.id
                    )
                    title = metadata.get("title", None)
                    title = title + ".txt" if title else None
                    file_data = {
                        "filename": title or "N/A",
                        "content_type": "text/plain",
                    }
                else:
                    # Unreachable given the earlier exclusivity checks, but
                    # kept as a defensive guard.
                    raise R2RException(
                        status_code=422,
                        message="Either a file or content must be provided.",
                    )
                workflow_input = {
                    "file_data": file_data,
                    "document_id": str(document_id),
                    "collection_ids": (
                        [str(cid) for cid in collection_ids]
                        if collection_ids
                        else None
                    ),
                    "metadata": metadata,
                    "ingestion_config": effective_ingestion_config.model_dump(
                        mode="json"
                    ),
                    "user": auth_user.model_dump_json(),
                    "size_in_bytes": content_length,
                    "version": "v0",
                }
                file_name = file_data["filename"]
                # Persist the raw file bytes before kicking off ingestion.
                await self.providers.database.files_handler.store_file(
                    document_id,
                    file_name,
                    file_content,
                    file_data["content_type"],
                )
                # Register the document record ahead of chunk processing.
                await self.services.ingestion.ingest_file_ingress(
                    file_data=workflow_input["file_data"],
                    user=auth_user,
                    document_id=workflow_input["document_id"],
                    size_in_bytes=workflow_input["size_in_bytes"],
                    metadata=workflow_input["metadata"],
                    version=workflow_input["version"],
                )
                if run_with_orchestration:
                    try:
                        # TODO - Modify create_chunks so that we can add chunks to existing document
                        workflow_result: dict[
                            str, str | None
                        ] = await self.providers.orchestration.run_workflow(  # type: ignore
                            "ingest-files",
                            {"request": workflow_input},
                            options={
                                "additional_metadata": {
                                    "document_id": str(document_id),
                                }
                            },
                        )
                        workflow_result["document_id"] = str(document_id)
                        return workflow_result  # type: ignore
                    except Exception as e:  # TODO: Need to find specific error (gRPC most likely?)
                        # Orchestration failure is non-fatal: log and fall
                        # through to the synchronous path below.
                        logger.error(
                            f"Error running orchestrated ingestion: {e} \n\nAttempting to run without orchestration."
                        )
                # Synchronous fallback (orchestration disabled or failed).
                logger.info(
                    f"Running ingestion without orchestration for file {file_name} and document_id {document_id}."
                )
                # TODO - Clean up implementation logic here to be more explicitly `synchronous`
                from core.main.orchestration import simple_ingestion_factory

                simple_ingestor = simple_ingestion_factory(self.services.ingestion)
                await simple_ingestor["ingest-files"](workflow_input)
                return {  # type: ignore
                    "message": "Document created and ingested successfully.",
                    "document_id": str(document_id),
                    "task_id": None,
                }
  530. @self.router.patch(
  531. "/documents/{id}/metadata",
  532. dependencies=[Depends(self.rate_limit_dependency)],
  533. summary="Append metadata to a document",
  534. openapi_extra={
  535. "x-codeSamples": [
  536. {
  537. "lang": "Python",
  538. "source": textwrap.dedent("""
  539. from r2r import R2RClient
  540. client = R2RClient()
  541. # when using auth, do client.login(...)
  542. response = client.documents.append_metadata(
  543. id="9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
  544. metadata=[{"key": "new_key", "value": "new_value"}]
  545. )
  546. """),
  547. },
  548. {
  549. "lang": "JavaScript",
  550. "source": textwrap.dedent("""
  551. const { r2rClient } = require("r2r-js");
  552. const client = new r2rClient();
  553. function main() {
  554. const response = await client.documents.appendMetadata({
  555. id: "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
  556. metadata: [{ key: "new_key", value: "new_value" }],
  557. });
  558. }
  559. main();
  560. """),
  561. },
  562. ]
  563. },
  564. )
  565. @self.base_endpoint
  566. async def patch_metadata(
  567. id: UUID = Path(
  568. ...,
  569. description="The ID of the document to append metadata to.",
  570. ),
  571. metadata: list[dict] = Body(
  572. ...,
  573. description="Metadata to append to the document.",
  574. ),
  575. auth_user=Depends(self.providers.auth.auth_wrapper()),
  576. ) -> WrappedDocumentResponse:
  577. """Appends metadata to a document. This endpoint allows adding new metadata fields or updating existing ones."""
  578. request_user_ids = (
  579. None if auth_user.is_superuser else [auth_user.id]
  580. )
  581. documents_overview_response = (
  582. await self.services.management.documents_overview(
  583. user_ids=request_user_ids,
  584. document_ids=[id],
  585. offset=0,
  586. limit=1,
  587. )
  588. )
  589. results = documents_overview_response["results"]
  590. if len(results) == 0:
  591. raise R2RException("Document not found.", 404)
  592. return await self.services.management.update_document_metadata(
  593. document_id=id,
  594. metadata=metadata,
  595. overwrite=False,
  596. )
        @self.router.put(
            "/documents/{id}/metadata",
            dependencies=[Depends(self.rate_limit_dependency)],
            summary="Replace metadata of a document",
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            from r2r import R2RClient
                            client = R2RClient()
                            # when using auth, do client.login(...)
                            response = client.documents.replace_metadata(
                                id="9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
                                metadata=[{"key": "new_key", "value": "new_value"}]
                            )
                            """),
                    },
                    {
                        "lang": "JavaScript",
                        "source": textwrap.dedent("""
                            const { r2rClient } = require("r2r-js");
                            const client = new r2rClient();
                            function main() {
                                const response = await client.documents.replaceMetadata({
                                    id: "9fbe403b-c11c-5aae-8ade-ef22980c3ad1",
                                    metadata: [{ key: "new_key", value: "new_value" }],
                                });
                            }
                            main();
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def put_metadata(
            id: UUID = Path(
                ...,
                description="The ID of the document to append metadata to.",
            ),
            metadata: list[dict] = Body(
                ...,
                description="Metadata to append to the document.",
            ),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> WrappedDocumentResponse:
            """Replaces metadata in a document. This endpoint allows overwriting existing metadata fields.

            Raises:
                R2RException: 404 if the document does not exist or is not
                    visible to the requesting user.
            """
            # Superusers see every document; regular users only their own.
            request_user_ids = (
                None if auth_user.is_superuser else [auth_user.id]
            )
            # Existence / ownership check before mutating metadata.
            documents_overview_response = (
                await self.services.management.documents_overview(
                    user_ids=request_user_ids,
                    document_ids=[id],
                    offset=0,
                    limit=1,
                )
            )
            results = documents_overview_response["results"]
            if len(results) == 0:
                raise R2RException("Document not found.", 404)
            # overwrite=True: existing metadata keys are replaced rather than
            # appended to (contrast with the PATCH-style sibling endpoint).
            return await self.services.management.update_document_metadata(
                document_id=id,
                metadata=metadata,
                overwrite=True,
            )
        @self.router.post(
            "/documents/export",
            summary="Export documents to CSV",
            dependencies=[Depends(self.rate_limit_dependency)],
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            from r2r import R2RClient
                            client = R2RClient("http://localhost:7272")
                            # when using auth, do client.login(...)
                            response = client.documents.export(
                                output_path="export.csv",
                                columns=["id", "title", "created_at"],
                                include_header=True,
                            )
                            """),
                    },
                    {
                        "lang": "JavaScript",
                        "source": textwrap.dedent("""
                            const { r2rClient } = require("r2r-js");
                            const client = new r2rClient("http://localhost:7272");
                            function main() {
                                await client.documents.export({
                                    outputPath: "export.csv",
                                    columns: ["id", "title", "created_at"],
                                    includeHeader: true,
                                });
                            }
                            main();
                            """),
                    },
                    {
                        "lang": "cURL",
                        "source": textwrap.dedent("""
                            curl -X POST "http://127.0.0.1:7272/v3/documents/export" \
                            -H "Authorization: Bearer YOUR_API_KEY" \
                            -H "Content-Type: application/json" \
                            -H "Accept: text/csv" \
                            -d '{ "columns": ["id", "title", "created_at"], "include_header": true }' \
                            --output export.csv
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def export_documents(
            background_tasks: BackgroundTasks,
            columns: Optional[list[str]] = Body(
                None, description="Specific columns to export"
            ),
            filters: Optional[dict] = Body(
                None, description="Filters to apply to the export"
            ),
            include_header: Optional[bool] = Body(
                True, description="Whether to include column headers"
            ),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> FileResponse:
            """Export documents as a downloadable CSV file.

            Superuser-only. The CSV is written to a temporary file which is
            closed (and thereby released) after the response has been sent.

            Raises:
                R2RException: 403 if the requesting user is not a superuser.
            """
            if not auth_user.is_superuser:
                raise R2RException(
                    "Only a superuser can export data.",
                    403,
                )
            (
                csv_file_path,
                temp_file,
            ) = await self.services.management.export_documents(
                columns=columns,
                filters=filters,
                # Treat an explicit null body value the same as the default.
                include_header=include_header
                if include_header is not None
                else True,
            )
            # Close the temp file only after FileResponse has streamed it.
            background_tasks.add_task(temp_file.close)
            return FileResponse(
                path=csv_file_path,
                media_type="text/csv",
                filename="documents_export.csv",
            )
        @self.router.get(
            "/documents/download_zip",
            dependencies=[Depends(self.rate_limit_dependency)],
            response_class=StreamingResponse,
            summary="Export multiple documents as zip",
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            client.documents.download_zip(
                                document_ids=["uuid1", "uuid2"],
                                start_date="2024-01-01",
                                end_date="2024-12-31"
                            )
                            """),
                    },
                    {
                        "lang": "cURL",
                        "source": textwrap.dedent("""
                            curl -X GET "https://api.example.com/v3/documents/download_zip?document_ids=uuid1,uuid2&start_date=2024-01-01&end_date=2024-12-31" \\
                            -H "Authorization: Bearer YOUR_API_KEY"
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def export_files(
            document_ids: Optional[list[UUID]] = Query(
                None,
                description="List of document IDs to include in the export. If not provided, all accessible documents will be included.",
            ),
            start_date: Optional[datetime] = Query(
                None,
                description="Filter documents created on or after this date.",
            ),
            end_date: Optional[datetime] = Query(
                None,
                description="Filter documents created before this date.",
            ),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> StreamingResponse:
            """Export multiple documents as a zip file. Documents can be
            filtered by IDs and/or date range.
            The endpoint allows downloading:
            - Specific documents by providing their IDs
            - Documents within a date range
            - All accessible documents if no filters are provided
            Files are streamed as a zip archive to handle potentially large downloads efficiently.

            Raises:
                R2RException: 403 if a non-superuser requests documents they
                    cannot access, or omits ``document_ids`` entirely.
            """
            if not auth_user.is_superuser:
                # For non-superusers, verify access to requested documents
                if document_ids:
                    documents_overview = (
                        await self.services.management.documents_overview(
                            user_ids=[auth_user.id],
                            document_ids=document_ids,
                            offset=0,
                            limit=len(document_ids),
                        )
                    )
                    # A shorter result set means at least one ID was not
                    # visible to this user.
                    if len(documents_overview["results"]) != len(document_ids):
                        raise R2RException(
                            status_code=403,
                            message="You don't have access to one or more requested documents.",
                        )
                if not document_ids:
                    raise R2RException(
                        status_code=403,
                        message="Non-superusers must provide document IDs to export.",
                    )
            (
                zip_name,
                zip_content,
                zip_size,
            ) = await self.services.management.export_files(
                document_ids=document_ids,
                start_date=start_date,
                end_date=end_date,
            )
            # RFC 5987 percent-encoding so non-ASCII names survive the header.
            encoded_filename = quote(zip_name)

            async def stream_file():
                # NOTE(review): the whole archive is yielded in one chunk;
                # presumably zip_content is an in-memory buffer — confirm.
                yield zip_content.getvalue()

            return StreamingResponse(
                stream_file(),
                media_type="application/zip",
                headers={
                    "Content-Disposition": f"attachment; filename*=UTF-8''{encoded_filename}",
                    "Content-Length": str(zip_size),
                },
            )
  840. @self.router.get(
  841. "/documents",
  842. dependencies=[Depends(self.rate_limit_dependency)],
  843. summary="List documents",
  844. openapi_extra={
  845. "x-codeSamples": [
  846. {
  847. "lang": "Python",
  848. "source": textwrap.dedent("""
  849. from r2r import R2RClient
  850. client = R2RClient()
  851. # when using auth, do client.login(...)
  852. response = client.documents.list(
  853. limit=10,
  854. offset=0
  855. )
  856. """),
  857. },
  858. {
  859. "lang": "JavaScript",
  860. "source": textwrap.dedent("""
  861. const { r2rClient } = require("r2r-js");
  862. const client = new r2rClient();
  863. function main() {
  864. const response = await client.documents.list({
  865. limit: 10,
  866. offset: 0,
  867. });
  868. }
  869. main();
  870. """),
  871. },
  872. {
  873. "lang": "cURL",
  874. "source": textwrap.dedent("""
  875. curl -X GET "https://api.example.com/v3/documents" \\
  876. -H "Authorization: Bearer YOUR_API_KEY"
  877. """),
  878. },
  879. ]
  880. },
  881. )
  882. @self.base_endpoint
  883. async def get_documents(
  884. ids: list[str] = Query(
  885. [],
  886. description="A list of document IDs to retrieve. If not provided, all documents will be returned.",
  887. ),
  888. offset: int = Query(
  889. 0,
  890. ge=0,
  891. description="Specifies the number of objects to skip. Defaults to 0.",
  892. ),
  893. limit: int = Query(
  894. 100,
  895. ge=1,
  896. le=1000,
  897. description="Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.",
  898. ),
  899. include_summary_embeddings: bool = Query(
  900. False,
  901. description="Specifies whether or not to include embeddings of each document summary.",
  902. ),
  903. auth_user=Depends(self.providers.auth.auth_wrapper()),
  904. ) -> WrappedDocumentsResponse:
  905. """Returns a paginated list of documents the authenticated user has
  906. access to.
  907. Results can be filtered by providing specific document IDs. Regular
  908. users will only see documents they own or have access to through
  909. collections. Superusers can see all documents.
  910. The documents are returned in order of last modification, with most
  911. recent first.
  912. """
  913. requesting_user_id = (
  914. None if auth_user.is_superuser else [auth_user.id]
  915. )
  916. filter_collection_ids = (
  917. None if auth_user.is_superuser else auth_user.collection_ids
  918. )
  919. document_uuids = [UUID(document_id) for document_id in ids]
  920. documents_overview_response = (
  921. await self.services.management.documents_overview(
  922. user_ids=requesting_user_id,
  923. collection_ids=filter_collection_ids,
  924. document_ids=document_uuids,
  925. offset=offset,
  926. limit=limit,
  927. )
  928. )
  929. if not include_summary_embeddings:
  930. for document in documents_overview_response["results"]:
  931. document.summary_embedding = None
  932. return ( # type: ignore
  933. documents_overview_response["results"],
  934. {
  935. "total_entries": documents_overview_response[
  936. "total_entries"
  937. ]
  938. },
  939. )
        @self.router.get(
            "/documents/{id}",
            dependencies=[Depends(self.rate_limit_dependency)],
            summary="Retrieve a document",
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            from r2r import R2RClient
                            client = R2RClient()
                            # when using auth, do client.login(...)
                            response = client.documents.retrieve(
                                id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                            )
                            """),
                    },
                    {
                        "lang": "JavaScript",
                        "source": textwrap.dedent("""
                            const { r2rClient } = require("r2r-js");
                            const client = new r2rClient();
                            function main() {
                                const response = await client.documents.retrieve({
                                    id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                                });
                            }
                            main();
                            """),
                    },
                    {
                        "lang": "cURL",
                        "source": textwrap.dedent("""
                            curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa" \\
                            -H "Authorization: Bearer YOUR_API_KEY"
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def get_document(
            id: UUID = Path(
                ...,
                description="The ID of the document to retrieve.",
            ),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> WrappedDocumentResponse:
            """Retrieves detailed information about a specific document by its
            ID.
            This endpoint returns the document's metadata, status, and system information. It does not
            return the document's content - use the `/documents/{id}/download` endpoint for that.
            Users can only retrieve documents they own or have access to through collections.
            Superusers can retrieve any document.

            Raises:
                R2RException: 404 if the document does not exist or is not
                    visible to the requesting user.
            """
            request_user_ids = (
                None if auth_user.is_superuser else [auth_user.id]
            )
            filter_collection_ids = (
                None if auth_user.is_superuser else auth_user.collection_ids
            )
            documents_overview_response = await self.services.management.documents_overview( # FIXME: This was using the pagination defaults from before... We need to review if this is as intended.
                user_ids=request_user_ids,
                collection_ids=filter_collection_ids,
                document_ids=[id],
                offset=0,
                limit=100,
            )
            results = documents_overview_response["results"]
            if len(results) == 0:
                raise R2RException("Document not found.", 404)
            # Exactly one document was requested, so return the first match.
            return results[0]
        @self.router.get(
            "/documents/{id}/chunks",
            dependencies=[Depends(self.rate_limit_dependency)],
            summary="List document chunks",
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            from r2r import R2RClient
                            client = R2RClient()
                            # when using auth, do client.login(...)
                            response = client.documents.list_chunks(
                                id="32b6a70f-a995-5c51-85d2-834f06283a1e"
                            )
                            """),
                    },
                    {
                        "lang": "JavaScript",
                        "source": textwrap.dedent("""
                            const { r2rClient } = require("r2r-js");
                            const client = new r2rClient();
                            function main() {
                                const response = await client.documents.listChunks({
                                    id: "32b6a70f-a995-5c51-85d2-834f06283a1e",
                                });
                            }
                            main();
                            """),
                    },
                    {
                        "lang": "cURL",
                        "source": textwrap.dedent("""
                            curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/chunks" \\
                            -H "Authorization: Bearer YOUR_API_KEY"\
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def list_chunks(
            id: UUID = Path(
                ...,
                description="The ID of the document to retrieve chunks for.",
            ),
            offset: int = Query(
                0,
                ge=0,
                description="Specifies the number of objects to skip. Defaults to 0.",
            ),
            limit: int = Query(
                100,
                ge=1,
                le=1000,
                description="Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.",
            ),
            include_vectors: Optional[bool] = Query(
                False,
                description="Whether to include vector embeddings in the response.",
            ),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> WrappedChunksResponse:
            """Retrieves the text chunks that were generated from a document
            during ingestion. Chunks represent semantic sections of the
            document and are used for retrieval and analysis.
            Users can only access chunks from documents they own or have access
            to through collections. Vector embeddings are only included if
            specifically requested.
            Results are returned in chunk sequence order, representing their
            position in the original document.

            Raises:
                R2RException: 404 if the document has no chunks; 403 if the
                    user may not access this document's chunks.
            """
            list_document_chunks = (
                await self.services.management.list_document_chunks(
                    document_id=id,
                    offset=offset,
                    limit=limit,
                    include_vectors=include_vectors or False,
                )
            )
            if not list_document_chunks["results"]:
                raise R2RException(
                    "No chunks found for the given document ID.", 404
                )
            # Ownership is read off the first chunk; all chunks of a document
            # share the same owner.
            is_owner = str(
                list_document_chunks["results"][0].get("owner_id")
            ) == str(auth_user.id)
            # limit=-1: fetch every collection containing this document.
            document_collections = (
                await self.services.management.collections_overview(
                    offset=0,
                    limit=-1,
                    document_ids=[id],
                )
            )
            # Access requires ownership or shared collection membership.
            user_has_access = (
                is_owner
                or set(auth_user.collection_ids).intersection(
                    {ele.id for ele in document_collections["results"]} # type: ignore
                )
                != set()
            )
            if not user_has_access and not auth_user.is_superuser:
                raise R2RException(
                    "Not authorized to access this document's chunks.", 403
                )
            return ( # type: ignore
                list_document_chunks["results"],
                {"total_entries": list_document_chunks["total_entries"]},
            )
        @self.router.get(
            "/documents/{id}/download",
            dependencies=[Depends(self.rate_limit_dependency)],
            response_class=StreamingResponse,
            summary="Download document content",
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            from r2r import R2RClient
                            client = R2RClient()
                            # when using auth, do client.login(...)
                            response = client.documents.download(
                                id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                            )
                            """),
                    },
                    {
                        "lang": "JavaScript",
                        "source": textwrap.dedent("""
                            const { r2rClient } = require("r2r-js");
                            const client = new r2rClient();
                            function main() {
                                const response = await client.documents.download({
                                    id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                                });
                            }
                            main();
                            """),
                    },
                    {
                        "lang": "cURL",
                        "source": textwrap.dedent("""
                            curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/download" \\
                            -H "Authorization: Bearer YOUR_API_KEY"
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def get_document_file(
            id: str = Path(..., description="Document ID"),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> StreamingResponse:
            """Downloads the original file content of a document.
            For uploaded files, returns the original file with its proper MIME
            type. For text-only documents, returns the content as plain text.
            Users can only download documents they own or have access to
            through collections.

            Raises:
                R2RException: 422 for a malformed document ID; 404 if the
                    document or its file is missing; 403 on access denial.
            """
            # `id` is declared as str so a malformed value yields a clean 422
            # instead of FastAPI's default path-conversion behavior.
            try:
                document_uuid = UUID(id)
            except ValueError:
                raise R2RException(
                    status_code=422, message="Invalid document ID format."
                ) from None
            # Retrieve the document's information
            documents_overview_response = (
                await self.services.management.documents_overview(
                    user_ids=None,
                    collection_ids=None,
                    document_ids=[document_uuid],
                    offset=0,
                    limit=1,
                )
            )
            if not documents_overview_response["results"]:
                raise R2RException("Document not found.", 404)
            document = documents_overview_response["results"][0]
            is_owner = str(document.owner_id) == str(auth_user.id)
            # Non-owners must share at least one collection with the document.
            if not auth_user.is_superuser and not is_owner:
                document_collections = (
                    await self.services.management.collections_overview(
                        offset=0,
                        limit=-1,
                        document_ids=[document_uuid],
                    )
                )
                document_collection_ids = {
                    str(ele.id)
                    for ele in document_collections["results"] # type: ignore
                }
                user_collection_ids = {
                    str(cid) for cid in auth_user.collection_ids
                }
                has_collection_access = user_collection_ids.intersection(
                    document_collection_ids
                )
                if not has_collection_access:
                    raise R2RException(
                        "Not authorized to access this document.", 403
                    )
            file_tuple = await self.services.management.download_file(
                document_uuid
            )
            if not file_tuple:
                raise R2RException(status_code=404, message="File not found.")
            file_name, file_content, file_size = file_tuple
            # RFC 5987 percent-encoding for non-ASCII filenames.
            encoded_filename = quote(file_name)
            mime_type, _ = mimetypes.guess_type(file_name)
            if not mime_type:
                mime_type = "application/octet-stream"

            async def file_stream():
                chunk_size = 1024 * 1024 # 1MB
                while True:
                    data = file_content.read(chunk_size)
                    if not data:
                        break
                    yield data

            return StreamingResponse(
                file_stream(),
                media_type=mime_type,
                headers={
                    "Content-Disposition": f"inline; filename*=UTF-8''{encoded_filename}",
                    "Content-Length": str(file_size),
                },
            )
        @self.router.delete(
            "/documents/by-filter",
            dependencies=[Depends(self.rate_limit_dependency)],
            summary="Delete documents by filter",
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            from r2r import R2RClient
                            client = R2RClient()
                            # when using auth, do client.login(...)
                            response = client.documents.delete_by_filter(
                                filters={"document_type": {"$eq": "txt"}}
                            )
                            """),
                    },
                    {
                        "lang": "cURL",
                        "source": textwrap.dedent("""
                            curl -X DELETE "https://api.example.com/v3/documents/by-filter?filters=%7B%22document_type%22%3A%7B%22%24eq%22%3A%22text%22%7D%2C%22created_at%22%3A%7B%22%24lt%22%3A%222023-01-01T00%3A00%3A00Z%22%7D%7D" \\
                            -H "Authorization: Bearer YOUR_API_KEY"
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def delete_document_by_filter(
            filters: Json[dict] = Body(
                ..., description="JSON-encoded filters"
            ),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> WrappedBooleanResponse:
            """Delete documents based on provided filters.
            Allowed operators
            include: `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`,
            `ilike`, `in`, and `nin`. Deletion requests are limited to a
            user's own documents.
            """
            # Scope the caller's filter to documents they own, regardless of
            # what the supplied filter matches.
            filters_dict = {
                "$and": [{"owner_id": {"$eq": str(auth_user.id)}}, filters]
            }
            await (
                self.services.management.delete_documents_and_chunks_by_filter(
                    filters=filters_dict
                )
            )
            # Best-effort success flag; the service raises on failure.
            return GenericBooleanResponse(success=True) # type: ignore
        @self.router.delete(
            "/documents/{id}",
            dependencies=[Depends(self.rate_limit_dependency)],
            summary="Delete a document",
            openapi_extra={
                "x-codeSamples": [
                    {
                        "lang": "Python",
                        "source": textwrap.dedent("""
                            from r2r import R2RClient
                            client = R2RClient()
                            # when using auth, do client.login(...)
                            response = client.documents.delete(
                                id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
                            )
                            """),
                    },
                    {
                        "lang": "JavaScript",
                        "source": textwrap.dedent("""
                            const { r2rClient } = require("r2r-js");
                            const client = new r2rClient();
                            function main() {
                                const response = await client.documents.delete({
                                    id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
                                });
                            }
                            main();
                            """),
                    },
                    {
                        "lang": "cURL",
                        "source": textwrap.dedent("""
                            curl -X DELETE "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa" \\
                            -H "Authorization: Bearer YOUR_API_KEY"
                            """),
                    },
                ]
            },
        )
        @self.base_endpoint
        async def delete_document_by_id(
            id: UUID = Path(..., description="Document ID"),
            auth_user=Depends(self.providers.auth.auth_wrapper()),
        ) -> WrappedBooleanResponse:
            """Delete a specific document. All chunks corresponding to the
            document are deleted, and all other references to the document are
            removed.
            NOTE - Deletions do not yet impact the knowledge graph or other derived data. This feature is planned for a future release.
            """
            # Superusers may delete any document by ID ...
            filters: dict[str, Any] = {"document_id": {"$eq": str(id)}}
            # ... everyone else only documents they own.
            if not auth_user.is_superuser:
                filters = {
                    "$and": [
                        {"owner_id": {"$eq": str(auth_user.id)}},
                        {"document_id": {"$eq": str(id)}},
                    ]
                }
            await (
                self.services.management.delete_documents_and_chunks_by_filter(
                    filters=filters
                )
            )
            return GenericBooleanResponse(success=True) # type: ignore
  1353. @self.router.get(
  1354. "/documents/{id}/collections",
  1355. dependencies=[Depends(self.rate_limit_dependency)],
  1356. summary="List document collections",
  1357. openapi_extra={
  1358. "x-codeSamples": [
  1359. {
  1360. "lang": "Python",
  1361. "source": textwrap.dedent("""
  1362. from r2r import R2RClient
  1363. client = R2RClient()
  1364. # when using auth, do client.login(...)
  1365. response = client.documents.list_collections(
  1366. id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa", offset=0, limit=10
  1367. )
  1368. """),
  1369. },
  1370. {
  1371. "lang": "JavaScript",
  1372. "source": textwrap.dedent("""
  1373. const { r2rClient } = require("r2r-js");
  1374. const client = new r2rClient();
  1375. function main() {
  1376. const response = await client.documents.listCollections({
  1377. id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1378. });
  1379. }
  1380. main();
  1381. """),
  1382. },
  1383. {
  1384. "lang": "cURL",
  1385. "source": textwrap.dedent("""
  1386. curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/collections" \\
  1387. -H "Authorization: Bearer YOUR_API_KEY"
  1388. """),
  1389. },
  1390. ]
  1391. },
  1392. )
  1393. @self.base_endpoint
  1394. async def get_document_collections(
  1395. id: str = Path(..., description="Document ID"),
  1396. offset: int = Query(
  1397. 0,
  1398. ge=0,
  1399. description="Specifies the number of objects to skip. Defaults to 0.",
  1400. ),
  1401. limit: int = Query(
  1402. 100,
  1403. ge=1,
  1404. le=1000,
  1405. description="Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.",
  1406. ),
  1407. auth_user=Depends(self.providers.auth.auth_wrapper()),
  1408. ) -> WrappedCollectionsResponse:
  1409. """Retrieves all collections that contain the specified document.
  1410. This endpoint is restricted to superusers only and provides a
  1411. system-wide view of document organization.
  1412. Collections are used to organize documents and manage access control. A document can belong
  1413. to multiple collections, and users can access documents through collection membership.
  1414. The results are paginated and ordered by collection creation date, with the most recently
  1415. created collections appearing first.
  1416. NOTE - This endpoint is only available to superusers, it will be extended to regular users in a future release.
  1417. """
  1418. if not auth_user.is_superuser:
  1419. raise R2RException(
  1420. "Only a superuser can get the collections belonging to a document.",
  1421. 403,
  1422. )
  1423. collections_response = (
  1424. await self.services.management.collections_overview(
  1425. offset=offset,
  1426. limit=limit,
  1427. document_ids=[UUID(id)], # Convert string ID to UUID
  1428. )
  1429. )
  1430. return collections_response["results"], { # type: ignore
  1431. "total_entries": collections_response["total_entries"]
  1432. }
  1433. @self.router.post(
  1434. "/documents/{id}/extract",
  1435. dependencies=[Depends(self.rate_limit_dependency)],
  1436. summary="Extract entities and relationships",
  1437. openapi_extra={
  1438. "x-codeSamples": [
  1439. {
  1440. "lang": "Python",
  1441. "source": textwrap.dedent("""
  1442. from r2r import R2RClient
  1443. client = R2RClient()
  1444. # when using auth, do client.login(...)
  1445. response = client.documents.extract(
  1446. id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
  1447. )
  1448. """),
  1449. },
  1450. ],
  1451. },
  1452. )
  1453. @self.base_endpoint
  1454. async def extract(
  1455. id: UUID = Path(
  1456. ...,
  1457. description="The ID of the document to extract entities and relationships from.",
  1458. ),
  1459. settings: Optional[GraphCreationSettings] = Body(
  1460. default=None,
  1461. description="Settings for the entities and relationships extraction process.",
  1462. ),
  1463. run_with_orchestration: Optional[bool] = Body(
  1464. default=True,
  1465. description="Whether to run the entities and relationships extraction process with orchestration.",
  1466. ),
  1467. auth_user=Depends(self.providers.auth.auth_wrapper()),
  1468. ) -> WrappedGenericMessageResponse:
  1469. """Extracts entities and relationships from a document.
  1470. The entities and relationships extraction process involves:
  1471. 1. Parsing documents into semantic chunks
  1472. 2. Extracting entities and relationships using LLMs
  1473. 3. Storing the created entities and relationships in the knowledge graph
  1474. 4. Preserving the document's metadata and content, and associating the elements with collections the document belongs to
  1475. """
  1476. settings = settings.dict() if settings else None # type: ignore
  1477. documents_overview_response = (
  1478. await self.services.management.documents_overview(
  1479. user_ids=(
  1480. None if auth_user.is_superuser else [auth_user.id]
  1481. ),
  1482. collection_ids=(
  1483. None
  1484. if auth_user.is_superuser
  1485. else auth_user.collection_ids
  1486. ),
  1487. document_ids=[id],
  1488. offset=0,
  1489. limit=1,
  1490. )
  1491. )["results"]
  1492. if len(documents_overview_response) == 0:
  1493. raise R2RException("Document not found.", 404)
  1494. if (
  1495. not auth_user.is_superuser
  1496. and auth_user.id != documents_overview_response[0].owner_id
  1497. ):
  1498. raise R2RException(
  1499. "Only a superuser can extract entities and relationships from a document they do not own.",
  1500. 403,
  1501. )
  1502. # Apply runtime settings overrides
  1503. server_graph_creation_settings = (
  1504. self.providers.database.config.graph_creation_settings
  1505. )
  1506. if settings:
  1507. server_graph_creation_settings = update_settings_from_dict(
  1508. server_settings=server_graph_creation_settings,
  1509. settings_dict=settings, # type: ignore
  1510. )
  1511. workflow_input = {
  1512. "document_id": str(id),
  1513. "graph_creation_settings": server_graph_creation_settings.model_dump_json(),
  1514. "user": auth_user.json(),
  1515. }
  1516. if run_with_orchestration:
  1517. try:
  1518. return await self.providers.orchestration.run_workflow( # type: ignore
  1519. "graph-extraction", {"request": workflow_input}, {}
  1520. )
  1521. except Exception as e: # TODO: Need to find specific errors that we should be excepting (gRPC most likely?)
  1522. logger.error(
  1523. f"Error running orchestrated extraction: {e} \n\nAttempting to run without orchestration."
  1524. )
  1525. from core.main.orchestration import (
  1526. simple_graph_search_results_factory,
  1527. )
  1528. logger.info("Running extract-triples without orchestration.")
  1529. simple_graph_search_results = simple_graph_search_results_factory(
  1530. self.services.graph
  1531. )
  1532. await simple_graph_search_results["graph-extraction"](
  1533. workflow_input
  1534. )
  1535. return { # type: ignore
  1536. "message": "Graph created successfully.",
  1537. "task_id": None,
  1538. }
  1539. @self.router.post(
  1540. "/documents/{id}/deduplicate",
  1541. dependencies=[Depends(self.rate_limit_dependency)],
  1542. summary="Deduplicate entities",
  1543. openapi_extra={
  1544. "x-codeSamples": [
  1545. {
  1546. "lang": "Python",
  1547. "source": textwrap.dedent("""
  1548. from r2r import R2RClient
  1549. client = R2RClient()
  1550. response = client.documents.deduplicate(
  1551. id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
  1552. )
  1553. """),
  1554. },
  1555. {
  1556. "lang": "JavaScript",
  1557. "source": textwrap.dedent("""
  1558. const { r2rClient } = require("r2r-js");
  1559. const client = new r2rClient();
  1560. function main() {
  1561. const response = await client.documents.deduplicate({
  1562. id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1563. });
  1564. }
  1565. main();
  1566. """),
  1567. },
  1568. {
  1569. "lang": "cURL",
  1570. "source": textwrap.dedent("""
  1571. curl -X POST "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/deduplicate" \\
  1572. -H "Authorization: Bearer YOUR_API_KEY"
  1573. """),
  1574. },
  1575. ],
  1576. },
  1577. )
  1578. @self.base_endpoint
  1579. async def deduplicate(
  1580. id: UUID = Path(
  1581. ...,
  1582. description="The ID of the document to extract entities and relationships from.",
  1583. ),
  1584. settings: Optional[GraphCreationSettings] = Body(
  1585. default=None,
  1586. description="Settings for the entities and relationships extraction process.",
  1587. ),
  1588. run_with_orchestration: Optional[bool] = Body(
  1589. default=True,
  1590. description="Whether to run the entities and relationships extraction process with orchestration.",
  1591. ),
  1592. auth_user=Depends(self.providers.auth.auth_wrapper()),
  1593. ) -> WrappedGenericMessageResponse:
  1594. """Deduplicates entities from a document."""
  1595. settings = settings.model_dump() if settings else None # type: ignore
  1596. documents_overview_response = (
  1597. await self.services.management.documents_overview(
  1598. user_ids=(
  1599. None if auth_user.is_superuser else [auth_user.id]
  1600. ),
  1601. collection_ids=(
  1602. None
  1603. if auth_user.is_superuser
  1604. else auth_user.collection_ids
  1605. ),
  1606. document_ids=[id],
  1607. offset=0,
  1608. limit=1,
  1609. )
  1610. )["results"]
  1611. if len(documents_overview_response) == 0:
  1612. raise R2RException("Document not found.", 404)
  1613. if (
  1614. not auth_user.is_superuser
  1615. and auth_user.id != documents_overview_response[0].owner_id
  1616. ):
  1617. raise R2RException(
  1618. "Only a superuser can run deduplication on a document they do not own.",
  1619. 403,
  1620. )
  1621. # Apply runtime settings overrides
  1622. server_graph_creation_settings = (
  1623. self.providers.database.config.graph_creation_settings
  1624. )
  1625. if settings:
  1626. server_graph_creation_settings = update_settings_from_dict(
  1627. server_settings=server_graph_creation_settings,
  1628. settings_dict=settings, # type: ignore
  1629. )
  1630. if run_with_orchestration:
  1631. try:
  1632. workflow_input = {
  1633. "document_id": str(id),
  1634. }
  1635. return await self.providers.orchestration.run_workflow( # type: ignore
  1636. "graph-deduplication",
  1637. {"request": workflow_input},
  1638. {},
  1639. )
  1640. except Exception as e: # TODO: Need to find specific errors that we should be excepting (gRPC most likely?)
  1641. logger.error(
  1642. f"Error running orchestrated deduplication: {e} \n\nAttempting to run without orchestration."
  1643. )
  1644. from core.main.orchestration import (
  1645. simple_graph_search_results_factory,
  1646. )
  1647. logger.info(
  1648. "Running deduplicate-document-entities without orchestration."
  1649. )
  1650. simple_graph_search_results = simple_graph_search_results_factory(
  1651. self.services.graph
  1652. )
  1653. await simple_graph_search_results["graph-deduplication"](
  1654. workflow_input
  1655. )
  1656. return { # type: ignore
  1657. "message": "Graph created successfully.",
  1658. "task_id": None,
  1659. }
  1660. @self.router.get(
  1661. "/documents/{id}/entities",
  1662. dependencies=[Depends(self.rate_limit_dependency)],
  1663. summary="Lists the entities from the document",
  1664. openapi_extra={
  1665. "x-codeSamples": [
  1666. {
  1667. "lang": "Python",
  1668. "source": textwrap.dedent("""
  1669. from r2r import R2RClient
  1670. client = R2RClient()
  1671. # when using auth, do client.login(...)
  1672. response = client.documents.extract(
  1673. id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa"
  1674. )
  1675. """),
  1676. },
  1677. ],
  1678. },
  1679. )
  1680. @self.base_endpoint
  1681. async def get_entities(
  1682. id: UUID = Path(
  1683. ...,
  1684. description="The ID of the document to retrieve entities from.",
  1685. ),
  1686. offset: int = Query(
  1687. 0,
  1688. ge=0,
  1689. description="Specifies the number of objects to skip. Defaults to 0.",
  1690. ),
  1691. limit: int = Query(
  1692. 100,
  1693. ge=1,
  1694. le=1000,
  1695. description="Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.",
  1696. ),
  1697. include_embeddings: Optional[bool] = Query(
  1698. False,
  1699. description="Whether to include vector embeddings in the response.",
  1700. ),
  1701. auth_user=Depends(self.providers.auth.auth_wrapper()),
  1702. ) -> WrappedEntitiesResponse:
  1703. """Retrieves the entities that were extracted from a document.
  1704. These represent important semantic elements like people, places,
  1705. organizations, concepts, etc.
  1706. Users can only access entities from documents they own or have
  1707. access to through collections. Entity embeddings are only included
  1708. if specifically requested.
  1709. Results are returned in the order they were extracted from the
  1710. document.
  1711. """
  1712. # if (
  1713. # not auth_user.is_superuser
  1714. # and id not in auth_user.collection_ids
  1715. # ):
  1716. # raise R2RException(
  1717. # "The currently authenticated user does not have access to the specified collection.",
  1718. # 403,
  1719. # )
  1720. # First check if the document exists and user has access
  1721. documents_overview_response = (
  1722. await self.services.management.documents_overview(
  1723. user_ids=(
  1724. None if auth_user.is_superuser else [auth_user.id]
  1725. ),
  1726. collection_ids=(
  1727. None
  1728. if auth_user.is_superuser
  1729. else auth_user.collection_ids
  1730. ),
  1731. document_ids=[id],
  1732. offset=0,
  1733. limit=1,
  1734. )
  1735. )
  1736. if not documents_overview_response["results"]:
  1737. raise R2RException("Document not found.", 404)
  1738. # Get all entities for this document from the document_entity table
  1739. (
  1740. entities,
  1741. count,
  1742. ) = await self.providers.database.graphs_handler.entities.get(
  1743. parent_id=id,
  1744. store_type=StoreType.DOCUMENTS,
  1745. offset=offset,
  1746. limit=limit,
  1747. include_embeddings=include_embeddings or False,
  1748. )
  1749. return entities, {"total_entries": count} # type: ignore
  1750. @self.router.post(
  1751. "/documents/{id}/entities/export",
  1752. summary="Export document entities to CSV",
  1753. dependencies=[Depends(self.rate_limit_dependency)],
  1754. openapi_extra={
  1755. "x-codeSamples": [
  1756. {
  1757. "lang": "Python",
  1758. "source": textwrap.dedent("""
  1759. from r2r import R2RClient
  1760. client = R2RClient("http://localhost:7272")
  1761. # when using auth, do client.login(...)
  1762. response = client.documents.export_entities(
  1763. id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1764. output_path="export.csv",
  1765. columns=["id", "title", "created_at"],
  1766. include_header=True,
  1767. )
  1768. """),
  1769. },
  1770. {
  1771. "lang": "JavaScript",
  1772. "source": textwrap.dedent("""
  1773. const { r2rClient } = require("r2r-js");
  1774. const client = new r2rClient("http://localhost:7272");
  1775. function main() {
  1776. await client.documents.exportEntities({
  1777. id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1778. outputPath: "export.csv",
  1779. columns: ["id", "title", "created_at"],
  1780. includeHeader: true,
  1781. });
  1782. }
  1783. main();
  1784. """),
  1785. },
  1786. {
  1787. "lang": "cURL",
  1788. "source": textwrap.dedent("""
  1789. curl -X POST "http://127.0.0.1:7272/v3/documents/export_entities" \
  1790. -H "Authorization: Bearer YOUR_API_KEY" \
  1791. -H "Content-Type: application/json" \
  1792. -H "Accept: text/csv" \
  1793. -d '{ "columns": ["id", "title", "created_at"], "include_header": true }' \
  1794. --output export.csv
  1795. """),
  1796. },
  1797. ]
  1798. },
  1799. )
  1800. @self.base_endpoint
  1801. async def export_entities(
  1802. background_tasks: BackgroundTasks,
  1803. id: UUID = Path(
  1804. ...,
  1805. description="The ID of the document to export entities from.",
  1806. ),
  1807. columns: Optional[list[str]] = Body(
  1808. None, description="Specific columns to export"
  1809. ),
  1810. filters: Optional[dict] = Body(
  1811. None, description="Filters to apply to the export"
  1812. ),
  1813. include_header: Optional[bool] = Body(
  1814. True, description="Whether to include column headers"
  1815. ),
  1816. auth_user=Depends(self.providers.auth.auth_wrapper()),
  1817. ) -> FileResponse:
  1818. """Export documents as a downloadable CSV file."""
  1819. if not auth_user.is_superuser:
  1820. raise R2RException(
  1821. "Only a superuser can export data.",
  1822. 403,
  1823. )
  1824. (
  1825. csv_file_path,
  1826. temp_file,
  1827. ) = await self.services.management.export_document_entities(
  1828. id=id,
  1829. columns=columns,
  1830. filters=filters,
  1831. include_header=include_header
  1832. if include_header is not None
  1833. else True,
  1834. )
  1835. background_tasks.add_task(temp_file.close)
  1836. return FileResponse(
  1837. path=csv_file_path,
  1838. media_type="text/csv",
  1839. filename="documents_export.csv",
  1840. )
  1841. @self.router.get(
  1842. "/documents/{id}/relationships",
  1843. dependencies=[Depends(self.rate_limit_dependency)],
  1844. summary="List document relationships",
  1845. openapi_extra={
  1846. "x-codeSamples": [
  1847. {
  1848. "lang": "Python",
  1849. "source": textwrap.dedent("""
  1850. from r2r import R2RClient
  1851. client = R2RClient()
  1852. # when using auth, do client.login(...)
  1853. response = client.documents.list_relationships(
  1854. id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1855. offset=0,
  1856. limit=100
  1857. )
  1858. """),
  1859. },
  1860. {
  1861. "lang": "JavaScript",
  1862. "source": textwrap.dedent("""
  1863. const { r2rClient } = require("r2r-js");
  1864. const client = new r2rClient();
  1865. function main() {
  1866. const response = await client.documents.listRelationships({
  1867. id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1868. offset: 0,
  1869. limit: 100,
  1870. });
  1871. }
  1872. main();
  1873. """),
  1874. },
  1875. {
  1876. "lang": "cURL",
  1877. "source": textwrap.dedent("""
  1878. curl -X GET "https://api.example.com/v3/documents/b4ac4dd6-5f27-596e-a55b-7cf242ca30aa/relationships" \\
  1879. -H "Authorization: Bearer YOUR_API_KEY"
  1880. """),
  1881. },
  1882. ]
  1883. },
  1884. )
  1885. @self.base_endpoint
  1886. async def get_relationships(
  1887. id: UUID = Path(
  1888. ...,
  1889. description="The ID of the document to retrieve relationships for.",
  1890. ),
  1891. offset: int = Query(
  1892. 0,
  1893. ge=0,
  1894. description="Specifies the number of objects to skip. Defaults to 0.",
  1895. ),
  1896. limit: int = Query(
  1897. 100,
  1898. ge=1,
  1899. le=1000,
  1900. description="Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.",
  1901. ),
  1902. entity_names: Optional[list[str]] = Query(
  1903. None,
  1904. description="Filter relationships by specific entity names.",
  1905. ),
  1906. relationship_types: Optional[list[str]] = Query(
  1907. None,
  1908. description="Filter relationships by specific relationship types.",
  1909. ),
  1910. auth_user=Depends(self.providers.auth.auth_wrapper()),
  1911. ) -> WrappedRelationshipsResponse:
  1912. """Retrieves the relationships between entities that were extracted
  1913. from a document. These represent connections and interactions
  1914. between entities found in the text.
  1915. Users can only access relationships from documents they own or have
  1916. access to through collections. Results can be filtered by entity
  1917. names and relationship types.
  1918. Results are returned in the order they were extracted from the
  1919. document.
  1920. """
  1921. # if (
  1922. # not auth_user.is_superuser
  1923. # and id not in auth_user.collection_ids
  1924. # ):
  1925. # raise R2RException(
  1926. # "The currently authenticated user does not have access to the specified collection.",
  1927. # 403,
  1928. # )
  1929. # First check if the document exists and user has access
  1930. documents_overview_response = (
  1931. await self.services.management.documents_overview(
  1932. user_ids=(
  1933. None if auth_user.is_superuser else [auth_user.id]
  1934. ),
  1935. collection_ids=(
  1936. None
  1937. if auth_user.is_superuser
  1938. else auth_user.collection_ids
  1939. ),
  1940. document_ids=[id],
  1941. offset=0,
  1942. limit=1,
  1943. )
  1944. )
  1945. if not documents_overview_response["results"]:
  1946. raise R2RException("Document not found.", 404)
  1947. # Get relationships for this document
  1948. (
  1949. relationships,
  1950. count,
  1951. ) = await self.providers.database.graphs_handler.relationships.get(
  1952. parent_id=id,
  1953. store_type=StoreType.DOCUMENTS,
  1954. entity_names=entity_names,
  1955. relationship_types=relationship_types,
  1956. offset=offset,
  1957. limit=limit,
  1958. )
  1959. return relationships, {"total_entries": count} # type: ignore
  1960. @self.router.post(
  1961. "/documents/{id}/relationships/export",
  1962. summary="Export document relationships to CSV",
  1963. dependencies=[Depends(self.rate_limit_dependency)],
  1964. openapi_extra={
  1965. "x-codeSamples": [
  1966. {
  1967. "lang": "Python",
  1968. "source": textwrap.dedent("""
  1969. from r2r import R2RClient
  1970. client = R2RClient("http://localhost:7272")
  1971. # when using auth, do client.login(...)
  1972. response = client.documents.export_entities(
  1973. id="b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1974. output_path="export.csv",
  1975. columns=["id", "title", "created_at"],
  1976. include_header=True,
  1977. )
  1978. """),
  1979. },
  1980. {
  1981. "lang": "JavaScript",
  1982. "source": textwrap.dedent("""
  1983. const { r2rClient } = require("r2r-js");
  1984. const client = new r2rClient("http://localhost:7272");
  1985. function main() {
  1986. await client.documents.exportEntities({
  1987. id: "b4ac4dd6-5f27-596e-a55b-7cf242ca30aa",
  1988. outputPath: "export.csv",
  1989. columns: ["id", "title", "created_at"],
  1990. includeHeader: true,
  1991. });
  1992. }
  1993. main();
  1994. """),
  1995. },
  1996. {
  1997. "lang": "cURL",
  1998. "source": textwrap.dedent("""
  1999. curl -X POST "http://127.0.0.1:7272/v3/documents/export_entities" \
  2000. -H "Authorization: Bearer YOUR_API_KEY" \
  2001. -H "Content-Type: application/json" \
  2002. -H "Accept: text/csv" \
  2003. -d '{ "columns": ["id", "title", "created_at"], "include_header": true }' \
  2004. --output export.csv
  2005. """),
  2006. },
  2007. ]
  2008. },
  2009. )
  2010. @self.base_endpoint
  2011. async def export_relationships(
  2012. background_tasks: BackgroundTasks,
  2013. id: UUID = Path(
  2014. ...,
  2015. description="The ID of the document to export entities from.",
  2016. ),
  2017. columns: Optional[list[str]] = Body(
  2018. None, description="Specific columns to export"
  2019. ),
  2020. filters: Optional[dict] = Body(
  2021. None, description="Filters to apply to the export"
  2022. ),
  2023. include_header: Optional[bool] = Body(
  2024. True, description="Whether to include column headers"
  2025. ),
  2026. auth_user=Depends(self.providers.auth.auth_wrapper()),
  2027. ) -> FileResponse:
  2028. """Export documents as a downloadable CSV file."""
  2029. if not auth_user.is_superuser:
  2030. raise R2RException(
  2031. "Only a superuser can export data.",
  2032. 403,
  2033. )
  2034. (
  2035. csv_file_path,
  2036. temp_file,
  2037. ) = await self.services.management.export_document_relationships(
  2038. id=id,
  2039. columns=columns,
  2040. filters=filters,
  2041. include_header=include_header
  2042. if include_header is not None
  2043. else True,
  2044. )
  2045. background_tasks.add_task(temp_file.close)
  2046. return FileResponse(
  2047. path=csv_file_path,
  2048. media_type="text/csv",
  2049. filename="documents_export.csv",
  2050. )
  2051. @self.router.post(
  2052. "/documents/search",
  2053. dependencies=[Depends(self.rate_limit_dependency)],
  2054. summary="Search document summaries",
  2055. )
  2056. @self.base_endpoint
  2057. async def search_documents(
  2058. query: str = Body(
  2059. ...,
  2060. description="The search query to perform.",
  2061. ),
  2062. search_mode: SearchMode = Body(
  2063. default=SearchMode.custom,
  2064. description=(
  2065. "Default value of `custom` allows full control over search settings.\n\n"
  2066. "Pre-configured search modes:\n"
  2067. "`basic`: A simple semantic-based search.\n"
  2068. "`advanced`: A more powerful hybrid search combining semantic and full-text.\n"
  2069. "`custom`: Full control via `search_settings`.\n\n"
  2070. "If `filters` or `limit` are provided alongside `basic` or `advanced`, "
  2071. "they will override the default settings for that mode."
  2072. ),
  2073. ),
  2074. search_settings: SearchSettings = Body(
  2075. default_factory=SearchSettings,
  2076. description="Settings for document search",
  2077. ),
  2078. auth_user=Depends(self.providers.auth.auth_wrapper()),
  2079. ) -> WrappedDocumentSearchResponse:
  2080. """Perform a search query on the automatically generated document
  2081. summaries in the system.
  2082. This endpoint allows for complex filtering of search results using PostgreSQL-based queries.
  2083. Filters can be applied to various fields such as document_id, and internal metadata values.
  2084. Allowed operators include `eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `like`, `ilike`, `in`, and `nin`.
  2085. """
  2086. effective_settings = self._prepare_search_settings(
  2087. auth_user, search_mode, search_settings
  2088. )
  2089. query_embedding = (
  2090. await self.providers.embedding.async_get_embedding(query)
  2091. )
  2092. results = await self.services.retrieval.search_documents(
  2093. query=query,
  2094. query_embedding=query_embedding,
  2095. settings=effective_settings,
  2096. )
  2097. return results # type: ignore
  2098. @staticmethod
  2099. async def _process_file(file):
  2100. import base64
  2101. content = await file.read()
  2102. return {
  2103. "filename": file.filename,
  2104. "content": base64.b64encode(content).decode("utf-8"),
  2105. "content_type": file.content_type,
  2106. "content_length": len(content),
  2107. }