file_search_tool.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317
import logging
from typing import List, Optional, Type

# import asyncio
import nest_asyncio
from pydantic import BaseModel, Field
from sqlalchemy.orm import Session

from app.core.tools.base_tool import BaseTool
from app.models.run import Run
from app.services.assistant.assistant import AssistantService
from app.services.file.file import FileService
# Patch asyncio so that async code can run nested inside an event loop that
# is already running.
nest_asyncio.apply()
# asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Disabled variant of the tool's input schema that also takes a list of file
# indexes. It sits in a bare string literal, so it is never executed.
"""
class FileSearchToolInput(BaseModel):
    indexes: List[int] = Field(
        ..., description="file index list to look up in retrieval"
    )
    query: str = Field(..., description="query to look up in retrieval")
"""
  20. class FileSearchToolInput(BaseModel):
  21. query: str = Field(..., description="query to look up in retrieval")
  22. class FileSearchTool(BaseTool):
  23. name: str = "file_search"
  24. description: str = (
  25. "Can be used to look up information that was uploaded to this assistant."
  26. # "If the user is referencing particular files, that is often a good hint that information may be here."
  27. "A search engine optimized for comprehensive, accurate, and trusted results. "
  28. "Useful for when you need to answer questions about current events. "
  29. "Input should be a search query."
  30. )
  31. args_schema: Type[BaseModel] = FileSearchToolInput
  32. def __init__(self) -> None:
  33. super().__init__()
  34. self.__filenames = []
  35. self.__keys = []
  36. self.__dirkeys = []
  37. self.loop = None
  38. '''
  39. def configure(self, session: Session, run: Run, **kwargs):
  40. # 获取当前事件循环
  41. # document_id = []
  42. file_key = []
  43. # filesinfo = []
  44. # 后语要从知识库里选择文件,所以在openassistant的数据库里可能不存在
  45. for key in run.file_ids:
  46. if len(key) == 36:
  47. self.__keys.append(key) # 添加文件id 作为检索
  48. else:
  49. file_key.append(
  50. key
  51. ) ## assiatant的id数据,在r2r里没办法检索需要提取filekey字段
  52. print(
  53. "document_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_id"
  54. )
  55. # print(document_id)
  56. print(file_key)
  57. files = []
  58. # 这种情况是uuid.ex 这种格式的在最早的时候存在的,后续要去掉
  59. if len(file_key) > 0:
  60. ## 获取文件信息
  61. files = FileService.get_file_list_by_ids(session=session, file_ids=file_key)
  62. for file in files:
  63. self.__keys.append(file.key)
  64. print(files)
  65. # r2r接口不提供多条件,否则上面没必要存在
  66. """
  67. if len(document_id) > 0:
  68. filesinfo += FileService.list_in_files(ids=document_id, offset=0, limit=100)
  69. # asyncio.run(
  70. # FileService.list_in_files(ids=document_id, offset=0, limit=100)
  71. # )
  72. for file in filesinfo:
  73. self.__filenames.append(file.get("title"))
  74. self.__keys.append(file.get("id"))
  75. print(filesinfo)
  76. """
  77. # files = FileService.list_in_files(ids=run.file_ids, offset=0, limit=100)
  78. # 读取assistant的数据,获取文件夹的id
  79. db_asst = AssistantService.get_assistant_sync(
  80. session=session, assistant_id=run.assistant_id
  81. )
  82. if db_asst.tool_resources and "file_search" in db_asst.tool_resources:
  83. ##{"file_search": {"vector_store_ids": [{"file_ids": []}]}}
  84. asst_folder_ids = (
  85. db_asst.tool_resources.get("file_search")
  86. .get("vector_stores")[0]
  87. .get("folder_ids")
  88. )
  89. print(asst_folder_ids)
  90. # folder_fileinfo = []
  91. if asst_folder_ids:
  92. self.__dirkeys = asst_folder_ids
  93. """
  94. for fid in asst_folder_ids:
  95. folder_fileinfo += FileService.list_documents(
  96. id=fid, offset=0, limit=100
  97. )
  98. # folder_fileinfo += asyncio.run(
  99. # FileService.list_documents(id=fid, offset=0, limit=100)
  100. # )
  101. print(folder_fileinfo)
  102. for file in folder_fileinfo:
  103. self.__filenames.append(file.get("title"))
  104. self.__keys.append(file.get("id"))
  105. """
  106. # pre-cache data to prevent thread conflicts that may occur later on.
  107. print(
  108. "---------ssssssssssss-----------------sssssssssssss---------------ssssssssssssss-------------sssssssssssss-------------ss-------"
  109. )
  110. print(self.__dirkeys)
  111. """
  112. for file in files:
  113. self.__filenames.append(file.filename)
  114. self.__keys.append(file.key)
  115. """
  116. print(self.__keys)
  117. # indexes: List[int],
  118. def run(self, query: str) -> dict:
  119. print(
  120. "file_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keys"
  121. )
  122. print(self.__keys)
  123. print(self.__dirkeys)
  124. files = FileService.search_in_files(
  125. query=query, file_keys=self.__keys, folder_keys=self.__dirkeys
  126. )
  127. print(files)
  128. return files
  129. """
  130. file_keys = []
  131. for index in indexes:
  132. if index is not None:
  133. file_key = self.__keys[index]
  134. file_keys.append(file_key)
  135. print(file_keys)
  136. files = []
  137. if len(file_keys) > 0:
  138. # self.loop = asyncio.get_event_loop()
  139. # files = asyncio.run(
  140. # FileService.search_in_files(query=query, file_keys=file_keys)
  141. # )
  142. print(files)
  143. return files
  144. """
  145. def instruction_supplement(self) -> str:
  146. """
  147. 为 Retrieval 提供文件选择信息,用于 llm 调用抉择
  148. """
  149. # if len(self.__filenames) == 0:
  150. # return ""
  151. # else:
  152. filenames_info = [
  153. f"({index}){filename}" for index, filename in enumerate(self.__filenames)
  154. ]
  155. return (
  156. 'You can use the "retrieval" tool to retrieve relevant context from the following attached files. '
  157. + 'Each line represents a file in the format "(index)filename":\n'
  158. + "\n".join(filenames_info)
  159. + "\nMake sure to be extremely concise when using attached files. "
  160. )
  161. def instruction_supplement(self) -> str:
  162. """
  163. 为 Retrieval 提供文件选择信息,用于 llm 调用抉择
  164. return (
  165. 'You can use the "retrieval" tool to retrieve relevant context from the following attached files. '
  166. # + 'Each line represents a file in the format "(index)filename":\n'
  167. # + "\n".join(filenames_info)
  168. + "\nMake sure to be extremely concise when using attached files. "
  169. )
  170. """
  171. return 'You can use the "retrieval" tool to search for relevant information. Please specify the keywords or context you are looking for to retrieve the most relevant content.'
  172. #'You can use the "retrieval" to search for relevant information within the attached files. Please specify the keywords or context you are looking for to retrieve the most relevant content.'
  173. '''
  174. def configure(self, session: Session, run: Run, **kwargs):
  175. # 获取当前事件循环
  176. document_id = []
  177. file_key = []
  178. filesinfo = []
  179. # 后语要从知识库里选择文件,所以在openassistant的数据库里可能不存在
  180. for key in run.file_ids:
  181. if len(key) == 36:
  182. document_id.append(key)
  183. else:
  184. file_key.append(key)
  185. print(
  186. "document_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_id"
  187. )
  188. print(document_id)
  189. print(file_key)
  190. files = []
  191. # 这种情况是uuid.ex 这种格式的在最早的时候存在的,后续要去掉
  192. if len(file_key) > 0:
  193. ## 获取文件信息
  194. files = FileService.get_file_list_by_ids(session=session, file_ids=file_key)
  195. print(files)
  196. # r2r接口不提供多条件,否则上面没必要存在
  197. if len(document_id) > 0:
  198. filesinfo += FileService.list_in_files(ids=document_id, offset=0, limit=100)
  199. # asyncio.run(
  200. # FileService.list_in_files(ids=document_id, offset=0, limit=100)
  201. # )
  202. for file in filesinfo:
  203. self.__filenames.append(file.get("title"))
  204. self.__keys.append(file.get("id"))
  205. print(filesinfo)
  206. # files = FileService.list_in_files(ids=run.file_ids, offset=0, limit=100)
  207. db_asst = AssistantService.get_assistant_sync(
  208. session=session, assistant_id=run.assistant_id
  209. )
  210. if db_asst.tool_resources and "file_search" in db_asst.tool_resources:
  211. ##{"file_search": {"vector_store_ids": [{"file_ids": []}]}}
  212. asst_folder_ids = (
  213. db_asst.tool_resources.get("file_search")
  214. .get("vector_stores")[0]
  215. .get("folder_ids")
  216. )
  217. print(asst_folder_ids)
  218. folder_fileinfo = []
  219. if asst_folder_ids:
  220. for fid in asst_folder_ids:
  221. folder_fileinfo += FileService.list_documents(
  222. id=fid, offset=0, limit=100
  223. )
  224. # folder_fileinfo += asyncio.run(
  225. # FileService.list_documents(id=fid, offset=0, limit=100)
  226. # )
  227. print(folder_fileinfo)
  228. for file in folder_fileinfo:
  229. self.__filenames.append(file.get("title"))
  230. self.__keys.append(file.get("id"))
  231. # pre-cache data to prevent thread conflicts that may occur later on.
  232. print(
  233. "---------ssssssssssss-----------------sssssssssssss---------------ssssssssssssss-------------sssssssssssss-------------ss-------"
  234. )
  235. print(files)
  236. for file in files:
  237. self.__filenames.append(file.filename)
  238. self.__keys.append(file.key)
  239. print(self.__keys)
  240. def run(self, indexes: List[int], query: str) -> dict:
  241. file_keys = []
  242. for index in indexes:
  243. if index is not None:
  244. file_key = self.__keys[index]
  245. file_keys.append(file_key)
  246. print(
  247. "file_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keys"
  248. )
  249. print(file_keys)
  250. files = []
  251. if len(file_keys) > 0:
  252. # self.loop = asyncio.get_event_loop()
  253. files = FileService.search_in_files(query=query, file_keys=file_keys)
  254. # files = asyncio.run(
  255. # FileService.search_in_files(query=query, file_keys=file_keys)
  256. # )
  257. print(files)
  258. return files
  259. def instruction_supplement(self) -> str:
  260. """
  261. 为 Retrieval 提供文件选择信息,用于 llm 调用抉择
  262. """
  263. if len(self.__filenames) == 0:
  264. return ""
  265. else:
  266. filenames_info = [
  267. f"({index}){filename}"
  268. for index, filename in enumerate(self.__filenames)
  269. ]
  270. return (
  271. 'You can use the "retrieval" tool to retrieve relevant context from the following attached files. '
  272. + 'Each line represents a file in the format "(index)filename":\n'
  273. + "\n".join(filenames_info)
  274. + "\nMake sure to be extremely concise when using attached files. "
  275. )