documents.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610
  1. import json
  2. from datetime import datetime
  3. from io import BytesIO
  4. from pathlib import Path
  5. from typing import Any, Optional
  6. from uuid import UUID
  7. import aiofiles
  8. from shared.api.models.base import WrappedBooleanResponse
  9. from shared.api.models.ingestion.responses import WrappedIngestionResponse
  10. from shared.api.models.management.responses import (
  11. WrappedChunksResponse,
  12. WrappedCollectionsResponse,
  13. WrappedDocumentResponse,
  14. WrappedDocumentsResponse,
  15. )
  16. from ..models import IngestionMode, SearchMode, SearchSettings
  17. class DocumentsSDK:
  18. """
  19. SDK for interacting with documents in the v3 API.
  20. """
  21. def __init__(self, client):
  22. self.client = client
  23. async def create(
  24. self,
  25. file_path: Optional[str] = None,
  26. raw_text: Optional[str] = None,
  27. chunks: Optional[list[str]] = None,
  28. id: Optional[str | UUID] = None,
  29. ingestion_mode: Optional[str] = None,
  30. collection_ids: Optional[list[str | UUID]] = None,
  31. metadata: Optional[dict] = None,
  32. ingestion_config: Optional[dict | IngestionMode] = None,
  33. run_with_orchestration: Optional[bool] = True,
  34. ) -> WrappedIngestionResponse:
  35. """
  36. Create a new document from either a file or content.
  37. Args:
  38. file_path (Optional[str]): The file to upload, if any
  39. content (Optional[str]): Optional text content to upload, if no file path is provided
  40. id (Optional[str | UUID]): Optional ID to assign to the document
  41. collection_ids (Optional[list[str | UUID]]): Collection IDs to associate with the document. If none are provided, the document will be assigned to the user's default collection.
  42. metadata (Optional[dict]): Optional metadata to assign to the document
  43. ingestion_config (Optional[dict]): Optional ingestion configuration to use
  44. run_with_orchestration (Optional[bool]): Whether to run with orchestration
  45. """
  46. if not file_path and not raw_text and not chunks:
  47. raise ValueError(
  48. "Either `file_path`, `raw_text` or `chunks` must be provided"
  49. )
  50. if (
  51. (file_path and raw_text)
  52. or (file_path and chunks)
  53. or (raw_text and chunks)
  54. ):
  55. raise ValueError(
  56. "Only one of `file_path`, `raw_text` or `chunks` may be provided"
  57. )
  58. data: dict[str, Any] = {}
  59. files = None
  60. if id:
  61. data["id"] = str(id)
  62. if metadata:
  63. data["metadata"] = json.dumps(metadata)
  64. if ingestion_config:
  65. if isinstance(ingestion_config, IngestionMode):
  66. ingestion_config = {"mode": ingestion_config.value}
  67. app_config: dict[str, Any] = (
  68. {}
  69. if isinstance(ingestion_config, dict)
  70. else ingestion_config["app"]
  71. )
  72. ingestion_config = dict(ingestion_config)
  73. ingestion_config["app"] = app_config
  74. data["ingestion_config"] = json.dumps(ingestion_config)
  75. if collection_ids:
  76. collection_ids = [str(collection_id) for collection_id in collection_ids] # type: ignore
  77. data["collection_ids"] = json.dumps(collection_ids)
  78. if run_with_orchestration is not None:
  79. data["run_with_orchestration"] = str(run_with_orchestration)
  80. if ingestion_mode is not None:
  81. data["ingestion_mode"] = ingestion_mode
  82. if file_path:
  83. # Create a new file instance that will remain open during the request
  84. file_instance = open(file_path, "rb")
  85. files = [
  86. (
  87. "file",
  88. (file_path, file_instance, "application/octet-stream"),
  89. )
  90. ]
  91. try:
  92. result = await self.client._make_request(
  93. "POST",
  94. "documents",
  95. data=data,
  96. files=files,
  97. version="v3",
  98. )
  99. finally:
  100. # Ensure we close the file after the request is complete
  101. file_instance.close()
  102. return result
  103. elif raw_text:
  104. data["raw_text"] = raw_text # type: ignore
  105. return await self.client._make_request(
  106. "POST",
  107. "documents",
  108. data=data,
  109. version="v3",
  110. )
  111. else:
  112. data["chunks"] = json.dumps(chunks)
  113. return await self.client._make_request(
  114. "POST",
  115. "documents",
  116. data=data,
  117. version="v3",
  118. )
  119. async def retrieve(
  120. self,
  121. id: str | UUID,
  122. ) -> WrappedDocumentResponse:
  123. """
  124. Get a specific document by ID.
  125. Args:
  126. id (str | UUID): ID of document to retrieve
  127. Returns:
  128. dict: Document information
  129. """
  130. return await self.client._make_request(
  131. "GET",
  132. f"documents/{str(id)}",
  133. version="v3",
  134. )
  135. async def download(
  136. self,
  137. id: str | UUID,
  138. ) -> BytesIO:
  139. response = await self.client._make_request(
  140. "GET",
  141. f"documents/{str(id)}/download",
  142. version="v3",
  143. )
  144. if not isinstance(response, BytesIO):
  145. raise ValueError("Expected BytesIO response")
  146. return response
  147. async def download_zip(
  148. self,
  149. document_ids: Optional[list[str | UUID]] = None,
  150. start_date: Optional[datetime] = None,
  151. end_date: Optional[datetime] = None,
  152. output_path: Optional[str | Path] = None,
  153. ) -> BytesIO | None:
  154. """
  155. Download multiple documents as a zip file.
  156. """
  157. params: dict[str, Any] = {}
  158. if document_ids:
  159. params["document_ids"] = [str(doc_id) for doc_id in document_ids]
  160. if start_date:
  161. params["start_date"] = start_date.isoformat()
  162. if end_date:
  163. params["end_date"] = end_date.isoformat()
  164. response = await self.client._make_request(
  165. "GET",
  166. "documents/download_zip",
  167. params=params,
  168. version="v3",
  169. )
  170. if not isinstance(response, BytesIO):
  171. raise ValueError("Expected BytesIO response")
  172. if output_path:
  173. output_path = (
  174. Path(output_path)
  175. if isinstance(output_path, str)
  176. else output_path
  177. )
  178. async with aiofiles.open(output_path, "wb") as f:
  179. await f.write(response.getvalue())
  180. return None
  181. return response
  182. async def export(
  183. self,
  184. output_path: str | Path,
  185. columns: Optional[list[str]] = None,
  186. filters: Optional[dict] = None,
  187. include_header: bool = True,
  188. ) -> None:
  189. """
  190. Export documents to a CSV file, streaming the results directly to disk.
  191. Args:
  192. output_path (str | Path): Local path where the CSV file should be saved
  193. columns (Optional[list[str]]): Specific columns to export. If None, exports default columns
  194. filters (Optional[dict]): Optional filters to apply when selecting documents
  195. include_header (bool): Whether to include column headers in the CSV (default: True)
  196. """
  197. # Convert path to string if it's a Path object
  198. output_path = (
  199. str(output_path) if isinstance(output_path, Path) else output_path
  200. )
  201. data: dict[str, Any] = {"include_header": include_header}
  202. if columns:
  203. data["columns"] = columns
  204. if filters:
  205. data["filters"] = filters
  206. # Stream response directly to file
  207. async with aiofiles.open(output_path, "wb") as f:
  208. async with self.client.session.post(
  209. f"{self.client.base_url}/v3/documents/export",
  210. json=data,
  211. headers={
  212. "Accept": "text/csv",
  213. **self.client._get_auth_headers(),
  214. },
  215. ) as response:
  216. if response.status != 200:
  217. raise ValueError(
  218. f"Export failed with status {response.status}",
  219. response,
  220. )
  221. async for chunk in response.content.iter_chunks():
  222. if chunk:
  223. await f.write(chunk[0])
  224. async def export_entities(
  225. self,
  226. id: str | UUID,
  227. output_path: str | Path,
  228. columns: Optional[list[str]] = None,
  229. filters: Optional[dict] = None,
  230. include_header: bool = True,
  231. ) -> None:
  232. """
  233. Export documents to a CSV file, streaming the results directly to disk.
  234. Args:
  235. output_path (str | Path): Local path where the CSV file should be saved
  236. columns (Optional[list[str]]): Specific columns to export. If None, exports default columns
  237. filters (Optional[dict]): Optional filters to apply when selecting documents
  238. include_header (bool): Whether to include column headers in the CSV (default: True)
  239. """
  240. # Convert path to string if it's a Path object
  241. output_path = (
  242. str(output_path) if isinstance(output_path, Path) else output_path
  243. )
  244. # Prepare request data
  245. data: dict[str, Any] = {"include_header": include_header}
  246. if columns:
  247. data["columns"] = columns
  248. if filters:
  249. data["filters"] = filters
  250. # Stream response directly to file
  251. async with aiofiles.open(output_path, "wb") as f:
  252. async with self.client.session.post(
  253. f"{self.client.base_url}/v3/documents/{str(id)}/entities/export",
  254. json=data,
  255. headers={
  256. "Accept": "text/csv",
  257. **self.client._get_auth_headers(),
  258. },
  259. ) as response:
  260. if response.status != 200:
  261. raise ValueError(
  262. f"Export failed with status {response.status}",
  263. response,
  264. )
  265. async for chunk in response.content.iter_chunks():
  266. if chunk:
  267. await f.write(chunk[0])
  268. async def export_relationships(
  269. self,
  270. id: str | UUID,
  271. output_path: str | Path,
  272. columns: Optional[list[str]] = None,
  273. filters: Optional[dict] = None,
  274. include_header: bool = True,
  275. ) -> None:
  276. """
  277. Export document relationships to a CSV file, streaming the results directly to disk.
  278. Args:
  279. output_path (str | Path): Local path where the CSV file should be saved
  280. columns (Optional[list[str]]): Specific columns to export. If None, exports default columns
  281. filters (Optional[dict]): Optional filters to apply when selecting documents
  282. include_header (bool): Whether to include column headers in the CSV (default: True)
  283. """
  284. # Convert path to string if it's a Path object
  285. output_path = (
  286. str(output_path) if isinstance(output_path, Path) else output_path
  287. )
  288. # Prepare request data
  289. data: dict[str, Any] = {"include_header": include_header}
  290. if columns:
  291. data["columns"] = columns
  292. if filters:
  293. data["filters"] = filters
  294. # Stream response directly to file
  295. async with aiofiles.open(output_path, "wb") as f:
  296. async with self.client.session.post(
  297. f"{self.client.base_url}/v3/documents/{str(id)}/relationships/export",
  298. json=data,
  299. headers={
  300. "Accept": "text/csv",
  301. **self.client._get_auth_headers(),
  302. },
  303. ) as response:
  304. if response.status != 200:
  305. raise ValueError(
  306. f"Export failed with status {response.status}",
  307. response,
  308. )
  309. async for chunk in response.content.iter_chunks():
  310. if chunk:
  311. await f.write(chunk[0])
  312. async def delete(
  313. self,
  314. id: str | UUID,
  315. ) -> WrappedBooleanResponse:
  316. """
  317. Delete a specific document.
  318. Args:
  319. id (str | UUID): ID of document to delete
  320. """
  321. return await self.client._make_request(
  322. "DELETE",
  323. f"documents/{str(id)}",
  324. version="v3",
  325. )
  326. async def list_chunks(
  327. self,
  328. id: str | UUID,
  329. include_vectors: Optional[bool] = False,
  330. offset: Optional[int] = 0,
  331. limit: Optional[int] = 100,
  332. ) -> WrappedChunksResponse:
  333. """
  334. Get chunks for a specific document.
  335. Args:
  336. id (str | UUID): ID of document to retrieve chunks for
  337. include_vectors (Optional[bool]): Whether to include vector embeddings in the response
  338. offset (int, optional): Specifies the number of objects to skip. Defaults to 0.
  339. limit (int, optional): Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.
  340. Returns:
  341. dict: List of document chunks and pagination information
  342. """
  343. params = {
  344. "offset": offset,
  345. "limit": limit,
  346. "include_vectors": include_vectors,
  347. }
  348. return await self.client._make_request(
  349. "GET",
  350. f"documents/{str(id)}/chunks",
  351. params=params,
  352. version="v3",
  353. )
  354. async def list_collections(
  355. self,
  356. id: str | UUID,
  357. include_vectors: Optional[bool] = False,
  358. offset: Optional[int] = 0,
  359. limit: Optional[int] = 100,
  360. ) -> WrappedCollectionsResponse:
  361. """
  362. List collections for a specific document.
  363. Args:
  364. id (str | UUID): ID of document to retrieve collections for
  365. offset (int, optional): Specifies the number of objects to skip. Defaults to 0.
  366. limit (int, optional): Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.
  367. Returns:
  368. dict: List of document chunks and pagination information
  369. """
  370. params = {
  371. "offset": offset,
  372. "limit": limit,
  373. }
  374. return await self.client._make_request(
  375. "GET",
  376. f"documents/{str(id)}/collections",
  377. params=params,
  378. version="v3",
  379. )
  380. async def delete_by_filter(
  381. self,
  382. filters: dict,
  383. ) -> WrappedBooleanResponse:
  384. """
  385. Delete documents based on filters.
  386. Args:
  387. filters (dict): Filters to apply when selecting documents to delete
  388. """
  389. filters_json = json.dumps(filters)
  390. return await self.client._make_request(
  391. "DELETE",
  392. "documents/by-filter",
  393. data=filters_json,
  394. # params={"filters": filters_json},
  395. # data=filters,
  396. version="v3",
  397. )
  398. async def extract(
  399. self,
  400. id: str | UUID,
  401. run_type: Optional[str] = None,
  402. settings: Optional[dict] = None,
  403. run_with_orchestration: Optional[bool] = True,
  404. ) -> dict:
  405. """
  406. Extract entities and relationships from a document.
  407. Args:
  408. id (str, UUID): ID of document to extract from
  409. run_type (Optional[str]): Whether to return an estimate or run extraction
  410. settings (Optional[dict]): Settings for extraction process
  411. run_with_orchestration (Optional[bool]): Whether to run with orchestration
  412. Returns:
  413. dict: Extraction results or cost estimate
  414. """
  415. data: dict[str, Any] = {}
  416. if run_type:
  417. data["run_type"] = run_type
  418. if settings:
  419. data["settings"] = json.dumps(settings)
  420. if run_with_orchestration is not None:
  421. data["run_with_orchestration"] = str(run_with_orchestration)
  422. return await self.client._make_request(
  423. "POST",
  424. f"documents/{str(id)}/extract",
  425. params=data,
  426. version="v3",
  427. )
  428. async def list_entities(
  429. self,
  430. id: str | UUID,
  431. offset: Optional[int] = 0,
  432. limit: Optional[int] = 100,
  433. include_embeddings: Optional[bool] = False,
  434. ) -> dict:
  435. """
  436. List entities extracted from a document.
  437. Args:
  438. id (str | UUID): ID of document to get entities from
  439. offset (Optional[int]): Number of items to skip
  440. limit (Optional[int]): Max number of items to return
  441. include_embeddings (Optional[bool]): Whether to include embeddings
  442. Returns:
  443. dict: List of entities and pagination info
  444. """
  445. params = {
  446. "offset": offset,
  447. "limit": limit,
  448. "include_embeddings": include_embeddings,
  449. }
  450. return await self.client._make_request(
  451. "GET",
  452. f"documents/{str(id)}/entities",
  453. params=params,
  454. version="v3",
  455. )
  456. async def list_relationships(
  457. self,
  458. id: str | UUID,
  459. offset: Optional[int] = 0,
  460. limit: Optional[int] = 100,
  461. entity_names: Optional[list[str]] = None,
  462. relationship_types: Optional[list[str]] = None,
  463. ) -> dict:
  464. """
  465. List relationships extracted from a document.
  466. Args:
  467. id (str | UUID): ID of document to get relationships from
  468. offset (Optional[int]): Number of items to skip
  469. limit (Optional[int]): Max number of items to return
  470. entity_names (Optional[list[str]]): Filter by entity names
  471. relationship_types (Optional[list[str]]): Filter by relationship types
  472. Returns:
  473. dict: List of relationships and pagination info
  474. """
  475. params: dict[str, Any] = {
  476. "offset": offset,
  477. "limit": limit,
  478. }
  479. if entity_names:
  480. params["entity_names"] = entity_names
  481. if relationship_types:
  482. params["relationship_types"] = relationship_types
  483. return await self.client._make_request(
  484. "GET",
  485. f"documents/{str(id)}/relationships",
  486. params=params,
  487. version="v3",
  488. )
  489. async def list(
  490. self,
  491. ids: Optional[list[str | UUID]] = None,
  492. offset: Optional[int] = 0,
  493. limit: Optional[int] = 100,
  494. ) -> WrappedDocumentsResponse:
  495. """
  496. List documents with pagination.
  497. Args:
  498. ids (Optional[list[str | UUID]]): Optional list of document IDs to filter by
  499. offset (int, optional): Specifies the number of objects to skip. Defaults to 0.
  500. limit (int, optional): Specifies a limit on the number of objects to return, ranging between 1 and 100. Defaults to 100.
  501. Returns:
  502. dict: List of documents and pagination information
  503. """
  504. params = {
  505. "offset": offset,
  506. "limit": limit,
  507. }
  508. if ids:
  509. params["ids"] = [str(doc_id) for doc_id in ids] # type: ignore
  510. return await self.client._make_request(
  511. "GET",
  512. "documents",
  513. params=params,
  514. version="v3",
  515. )
  516. async def search(
  517. self,
  518. query: str,
  519. search_mode: Optional[str | SearchMode] = "custom",
  520. search_settings: Optional[dict | SearchSettings] = None,
  521. ):
  522. """
  523. Conduct a vector and/or KG search.
  524. Args:
  525. query (str): The query to search for.
  526. search_settings (Optional[dict, SearchSettings]]): Vector search settings.
  527. Returns:
  528. CombinedSearchResponse: The search response.
  529. """
  530. # if search_mode and not isinstance(search_mode, str):
  531. # search_mode = search_mode.value
  532. if search_settings and not isinstance(search_settings, dict):
  533. search_settings = search_settings.model_dump()
  534. data: dict[str, Any] = {
  535. "query": query,
  536. "search_settings": search_settings,
  537. }
  538. if search_mode:
  539. data["search_mode"] = search_mode
  540. return await self.client._make_request(
  541. "POST",
  542. "documents/search",
  543. json=data,
  544. version="v3",
  545. )