documents.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. import json
  2. from io import BytesIO
  3. from typing import Optional
  4. from uuid import UUID
  5. from shared.api.models.base import WrappedBooleanResponse
  6. from shared.api.models.ingestion.responses import WrappedIngestionResponse
  7. from shared.api.models.management.responses import (
  8. WrappedChunksResponse,
  9. WrappedCollectionsResponse,
  10. WrappedDocumentResponse,
  11. WrappedDocumentsResponse,
  12. )
  13. from ..models import IngestionMode, SearchMode, SearchSettings
  14. class DocumentsSDK:
  15. """
  16. SDK for interacting with documents in the v3 API.
  17. """
  18. def __init__(self, client):
  19. self.client = client
  20. async def create(
  21. self,
  22. file_path: Optional[str] = None,
  23. raw_text: Optional[str] = None,
  24. chunks: Optional[list[str]] = None,
  25. id: Optional[str | UUID] = None,
  26. ingestion_mode: Optional[str] = None,
  27. collection_ids: Optional[list[str | UUID]] = None,
  28. metadata: Optional[dict] = None,
  29. ingestion_config: Optional[dict | IngestionMode] = None,
  30. run_with_orchestration: Optional[bool] = True,
  31. ) -> WrappedIngestionResponse:
  32. """
  33. Create a new document from either a file or content.
  34. Args:
  35. file_path (Optional[str]): The file to upload, if any
  36. content (Optional[str]): Optional text content to upload, if no file path is provided
  37. id (Optional[Union[str, UUID]]): Optional ID to assign to the document
  38. collection_ids (Optional[list[Union[str, UUID]]]): Collection IDs to associate with the document. If none are provided, the document will be assigned to the user's default collection.
  39. metadata (Optional[dict]): Optional metadata to assign to the document
  40. ingestion_config (Optional[dict]): Optional ingestion configuration to use
  41. run_with_orchestration (Optional[bool]): Whether to run with orchestration
  42. """
  43. if not file_path and not raw_text and not chunks:
  44. raise ValueError(
  45. "Either `file_path`, `raw_text` or `chunks` must be provided"
  46. )
  47. if (
  48. (file_path and raw_text)
  49. or (file_path and chunks)
  50. or (raw_text and chunks)
  51. ):
  52. raise ValueError(
  53. "Only one of `file_path`, `raw_text` or `chunks` may be provided"
  54. )
  55. data = {}
  56. files = None
  57. if id:
  58. data["id"] = str(id) # json.dumps(str(id))
  59. if metadata:
  60. data["metadata"] = json.dumps(metadata)
  61. if ingestion_config:
  62. if not isinstance(ingestion_config, dict):
  63. ingestion_config = ingestion_config.model_dump()
  64. ingestion_config["app"] = {}
  65. data["ingestion_config"] = json.dumps(ingestion_config)
  66. if collection_ids:
  67. collection_ids = [str(collection_id) for collection_id in collection_ids] # type: ignore
  68. data["collection_ids"] = json.dumps(collection_ids)
  69. if run_with_orchestration is not None:
  70. data["run_with_orchestration"] = str(run_with_orchestration)
  71. if ingestion_mode is not None:
  72. data["ingestion_mode"] = ingestion_mode
  73. if file_path:
  74. # Create a new file instance that will remain open during the request
  75. file_instance = open(file_path, "rb")
  76. files = [
  77. (
  78. "file",
  79. (file_path, file_instance, "application/octet-stream"),
  80. )
  81. ]
  82. try:
  83. result = await self.client._make_request(
  84. "POST",
  85. "documents",
  86. data=data,
  87. files=files,
  88. version="v3",
  89. )
  90. finally:
  91. # Ensure we close the file after the request is complete
  92. file_instance.close()
  93. return result
  94. elif raw_text:
  95. data["raw_text"] = raw_text # type: ignore
  96. return await self.client._make_request(
  97. "POST",
  98. "documents",
  99. data=data,
  100. version="v3",
  101. )
  102. else:
  103. data["chunks"] = json.dumps(chunks)
  104. return await self.client._make_request(
  105. "POST",
  106. "documents",
  107. data=data,
  108. version="v3",
  109. )
  110. async def retrieve(
  111. self,
  112. id: str | UUID,
  113. ) -> WrappedDocumentResponse:
  114. """
  115. Get a specific document by ID.
  116. Args:
  117. id (Union[str, UUID]): ID of document to retrieve
  118. Returns:
  119. dict: Document information
  120. """
  121. return await self.client._make_request(
  122. "GET",
  123. f"documents/{str(id)}",
  124. version="v3",
  125. )
  126. # you could do something like:
  127. async def download(
  128. self,
  129. id: str | UUID,
  130. ) -> BytesIO:
  131. response = await self.client._make_request(
  132. "GET",
  133. f"documents/{str(id)}/download",
  134. version="v3",
  135. # No json parsing here, if possible
  136. )
  137. if not isinstance(response, BytesIO):
  138. raise ValueError("Expected BytesIO response")
  139. return response
  140. async def delete(
  141. self,
  142. id: str | UUID,
  143. ) -> WrappedBooleanResponse:
  144. """
  145. Delete a specific document.
  146. Args:
  147. id (Union[str, UUID]): ID of document to delete
  148. """
  149. return await self.client._make_request(
  150. "DELETE",
  151. f"documents/{str(id)}",
  152. version="v3",
  153. )
  154. async def list_chunks(
  155. self,
  156. id: str | UUID,
  157. include_vectors: Optional[bool] = False,
  158. offset: Optional[int] = 0,
  159. limit: Optional[int] = 100,
  160. ) -> WrappedChunksResponse:
  161. """
  162. Get chunks for a specific document.
  163. Args:
  164. id (Union[str, UUID]): ID of document to retrieve chunks for
  165. include_vectors (Optional[bool]): Whether to include vector embeddings in the response
  166. offset (int, optional): Specifies the number of objects to skip. Defaults to 0.
  167. limit (int, optional): Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.
  168. Returns:
  169. dict: List of document chunks and pagination information
  170. """
  171. params = {
  172. "offset": offset,
  173. "limit": limit,
  174. "include_vectors": include_vectors,
  175. }
  176. return await self.client._make_request(
  177. "GET",
  178. f"documents/{str(id)}/chunks",
  179. params=params,
  180. version="v3",
  181. )
  182. async def list_collections(
  183. self,
  184. id: str | UUID,
  185. include_vectors: Optional[bool] = False,
  186. offset: Optional[int] = 0,
  187. limit: Optional[int] = 100,
  188. ) -> WrappedCollectionsResponse:
  189. """
  190. List collections for a specific document.
  191. Args:
  192. id (Union[str, UUID]): ID of document to retrieve collections for
  193. offset (int, optional): Specifies the number of objects to skip. Defaults to 0.
  194. limit (int, optional): Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.
  195. Returns:
  196. dict: List of document chunks and pagination information
  197. """
  198. params = {
  199. "offset": offset,
  200. "limit": limit,
  201. }
  202. return await self.client._make_request(
  203. "GET",
  204. f"documents/{str(id)}/collections",
  205. params=params,
  206. version="v3",
  207. )
  208. async def delete_by_filter(
  209. self,
  210. filters: dict,
  211. ) -> WrappedBooleanResponse:
  212. """
  213. Delete documents based on filters.
  214. Args:
  215. filters (dict): Filters to apply when selecting documents to delete
  216. """
  217. filters_json = json.dumps(filters)
  218. return await self.client._make_request(
  219. "DELETE",
  220. "documents/by-filter",
  221. data=filters_json,
  222. # params={"filters": filters_json},
  223. # data=filters,
  224. version="v3",
  225. )
  226. async def extract(
  227. self,
  228. id: str | UUID,
  229. run_type: Optional[str] = None,
  230. settings: Optional[dict] = None,
  231. run_with_orchestration: Optional[bool] = True,
  232. ) -> dict:
  233. """
  234. Extract entities and relationships from a document.
  235. Args:
  236. id (Union[str, UUID]): ID of document to extract from
  237. run_type (Optional[str]): Whether to return an estimate or run extraction
  238. settings (Optional[dict]): Settings for extraction process
  239. run_with_orchestration (Optional[bool]): Whether to run with orchestration
  240. Returns:
  241. dict: Extraction results or cost estimate
  242. """
  243. data = {}
  244. if run_type:
  245. data["run_type"] = run_type
  246. if settings:
  247. data["settings"] = json.dumps(settings)
  248. if run_with_orchestration is not None:
  249. data["run_with_orchestration"] = str(run_with_orchestration)
  250. return await self.client._make_request(
  251. "POST",
  252. f"documents/{str(id)}/extract",
  253. params=data,
  254. version="v3",
  255. )
  256. async def list_entities(
  257. self,
  258. id: str | UUID,
  259. offset: Optional[int] = 0,
  260. limit: Optional[int] = 100,
  261. include_embeddings: Optional[bool] = False,
  262. ) -> dict:
  263. """
  264. List entities extracted from a document.
  265. Args:
  266. id (Union[str, UUID]): ID of document to get entities from
  267. offset (Optional[int]): Number of items to skip
  268. limit (Optional[int]): Max number of items to return
  269. include_embeddings (Optional[bool]): Whether to include embeddings
  270. Returns:
  271. dict: List of entities and pagination info
  272. """
  273. params = {
  274. "offset": offset,
  275. "limit": limit,
  276. "include_embeddings": include_embeddings,
  277. }
  278. return await self.client._make_request(
  279. "GET",
  280. f"documents/{str(id)}/entities",
  281. params=params,
  282. version="v3",
  283. )
  284. async def list_relationships(
  285. self,
  286. id: str | UUID,
  287. offset: Optional[int] = 0,
  288. limit: Optional[int] = 100,
  289. entity_names: Optional[list[str]] = None,
  290. relationship_types: Optional[list[str]] = None,
  291. ) -> dict:
  292. """
  293. List relationships extracted from a document.
  294. Args:
  295. id (Union[str, UUID]): ID of document to get relationships from
  296. offset (Optional[int]): Number of items to skip
  297. limit (Optional[int]): Max number of items to return
  298. entity_names (Optional[list[str]]): Filter by entity names
  299. relationship_types (Optional[list[str]]): Filter by relationship types
  300. Returns:
  301. dict: List of relationships and pagination info
  302. """
  303. params = {
  304. "offset": offset,
  305. "limit": limit,
  306. }
  307. if entity_names:
  308. params["entity_names"] = entity_names
  309. if relationship_types:
  310. params["relationship_types"] = relationship_types
  311. return await self.client._make_request(
  312. "GET",
  313. f"documents/{str(id)}/relationships",
  314. params=params,
  315. version="v3",
  316. )
  317. # async def extract(
  318. # self,
  319. # id: str | UUID,
  320. # run_type: Optional[str] = None,
  321. # run_with_orchestration: Optional[bool] = True,
  322. # ):
  323. # data = {}
  324. # if run_type:
  325. # data["run_type"] = run_type
  326. # if run_with_orchestration is not None:
  327. # data["run_with_orchestration"] = str(run_with_orchestration)
  328. # return await self.client._make_request(
  329. # "POST",
  330. # f"documents/{str(id)}/extract",
  331. # params=data,
  332. # version="v3",
  333. # )
  334. # Be sure to put at bottom of the page...
  335. async def list(
  336. self,
  337. ids: Optional[list[str | UUID]] = None,
  338. offset: Optional[int] = 0,
  339. limit: Optional[int] = 100,
  340. ) -> WrappedDocumentsResponse:
  341. """
  342. List documents with pagination.
  343. Args:
  344. ids (Optional[list[Union[str, UUID]]]): Optional list of document IDs to filter by
  345. offset (int, optional): Specifies the number of objects to skip. Defaults to 0.
  346. limit (int, optional): Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.
  347. Returns:
  348. dict: List of documents and pagination information
  349. """
  350. params = {
  351. "offset": offset,
  352. "limit": limit,
  353. }
  354. if ids:
  355. params["ids"] = [str(doc_id) for doc_id in ids] # type: ignore
  356. return await self.client._make_request(
  357. "GET",
  358. "documents",
  359. params=params,
  360. version="v3",
  361. )
  362. async def search(
  363. self,
  364. query: str,
  365. search_mode: Optional[str | SearchMode] = "custom",
  366. search_settings: Optional[dict | SearchSettings] = None,
  367. ):
  368. """
  369. Conduct a vector and/or KG search.
  370. Args:
  371. query (str): The query to search for.
  372. search_settings (Optional[dict, SearchSettings]]): Vector search settings.
  373. Returns:
  374. CombinedSearchResponse: The search response.
  375. """
  376. # if search_mode and not isinstance(search_mode, str):
  377. # search_mode = search_mode.value
  378. if search_settings and not isinstance(search_settings, dict):
  379. search_settings = search_settings.model_dump()
  380. data = {
  381. "query": query,
  382. "search_settings": search_settings,
  383. }
  384. if search_mode:
  385. data["search_mode"] = search_mode
  386. return await self.client._make_request(
  387. "POST",
  388. "documents/search",
  389. json=data,
  390. version="v3",
  391. )