file_search_tool.py 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178
# import asyncio
import logging
from typing import Type, List

import nest_asyncio
from pydantic import BaseModel, Field
from sqlalchemy.orm import Session

from app.core.tools.base_tool import BaseTool
from app.models.run import Run
from app.services.assistant.assistant import AssistantService
from app.services.file.file import FileService
  10. # Allow async code to run nested inside an already-running event loop
  11. nest_asyncio.apply()
  12. # asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
  13. # query: str = Field(..., description="query to look up in retrieval")
  14. class FileSearchToolInput(BaseModel):
  15. # query: str = Field(
  16. # ...,
  17. # description="query to look up in retrieval",
  18. # )
  19. query: str = Field(
  20. ...,
  21. description="query to look up in retrieval",
  22. )
  23. class FileSearchTool(BaseTool):
  24. name: str = "file_search"
  25. description: str = (
  26. "## 工具说明:这里是用户上传的文件汇总成知识库,可以根据用户输入的问题从文件知识库中检索相关的内容。## 注意:仅允许每次调用一次。"
  27. )
  28. args_schema: Type[BaseModel] = FileSearchToolInput
  29. def __init__(self) -> None:
  30. super().__init__()
  31. self.__filenames = []
  32. self.__keys = []
  33. self.__dirkeys = []
  34. self.loop = None
  35. self.index = 0
  36. def configure(self, session: Session, run: Run, **kwargs):
  37. # 获取当前事件循环
  38. # document_id = []
  39. file_key = []
  40. # filesinfo = []
  41. # 后语要从知识库里选择文件,所以在openassistant的数据库里可能不存在
  42. for key in run.file_ids:
  43. if len(key) == 36:
  44. self.__keys.append(key) # 添加文件id 作为检索
  45. else:
  46. file_key.append(
  47. key
  48. ) ## assiatant的id数据,在r2r里没办法检索需要提取filekey字段
  49. print(
  50. "document_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_iddocument_id"
  51. )
  52. # print(document_id)
  53. print(file_key)
  54. files = []
  55. # 这种情况是uuid.ex 这种格式的在最早的时候存在的,后续要去掉
  56. if len(file_key) > 0:
  57. ## 获取文件信息
  58. files = FileService.get_file_list_by_ids(session=session, file_ids=file_key)
  59. for file in files:
  60. self.__keys.append(file.key)
  61. print(files)
  62. # r2r接口不提供多条件,否则上面没必要存在
  63. """
  64. if len(document_id) > 0:
  65. filesinfo += FileService.list_in_files(ids=document_id, offset=0, limit=100)
  66. # asyncio.run(
  67. # FileService.list_in_files(ids=document_id, offset=0, limit=100)
  68. # )
  69. for file in filesinfo:
  70. self.__filenames.append(file.get("title"))
  71. self.__keys.append(file.get("id"))
  72. print(filesinfo)
  73. """
  74. # files = FileService.list_in_files(ids=run.file_ids, offset=0, limit=100)
  75. # 读取assistant的数据,获取文件夹的id
  76. db_asst = AssistantService.get_assistant_sync(
  77. session=session, assistant_id=run.assistant_id
  78. )
  79. if db_asst.tool_resources and "file_search" in db_asst.tool_resources:
  80. ##{"file_search": {"vector_store_ids": [{"file_ids": []}]}}
  81. asst_folder_ids = (
  82. db_asst.tool_resources.get("file_search")
  83. .get("vector_stores")[0]
  84. .get("folder_ids")
  85. )
  86. print(asst_folder_ids)
  87. # folder_fileinfo = []
  88. if asst_folder_ids:
  89. self.__dirkeys = asst_folder_ids
  90. """
  91. for fid in asst_folder_ids:
  92. folder_fileinfo += FileService.list_documents(
  93. id=fid, offset=0, limit=100
  94. )
  95. # folder_fileinfo += asyncio.run(
  96. # FileService.list_documents(id=fid, offset=0, limit=100)
  97. # )
  98. print(folder_fileinfo)
  99. for file in folder_fileinfo:
  100. self.__filenames.append(file.get("title"))
  101. self.__keys.append(file.get("id"))
  102. """
  103. # pre-cache data to prevent thread conflicts that may occur later on.
  104. print(
  105. "---------ssssssssssss-----------------sssssssssssss---------------ssssssssssssss-------------sssssssssssss-------------ss-------"
  106. )
  107. print(self.__dirkeys)
  108. """
  109. for file in files:
  110. self.__filenames.append(file.filename)
  111. self.__keys.append(file.key)
  112. """
  113. print(self.__keys)
  114. # indexes: List[int],
  115. def run(self, query: str) -> dict:
  116. print(
  117. "file_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keysfile_keys"
  118. )
  119. print(self.__keys)
  120. print(self.__dirkeys)
  121. files = []
  122. if self.index == 0:
  123. files = FileService.search_in_files(
  124. query=query, file_keys=self.__keys, folder_keys=self.__dirkeys
  125. )
  126. self.index = 1
  127. print(files)
  128. return files
  129. def instruction_supplement(self) -> str:
  130. """
  131. 为 Retrieval 提供文件选择信息,用于 llm 调用抉择
  132. """
  133. if (self.__keys and len(self.__keys) > 0) or (
  134. self.__dirkeys and len(self.__dirkeys) > 0
  135. ):
  136. return "" # "## 能力限制:每次请求将会限制为最多使用三种工具,并且每种工具每次只能使用一次。这样可以保证每次操作都更加简洁和有效。" # "## Top Important: Please use the 'file_search' tool to search for relevant content or keywords. Summarize the content or keywords and provide them for 'file_search' tool usage. Please ensure that the 'file_search' tool is used only once per request."
  137. else:
  138. return ""
  139. # return '## important:You can use the "retrieval" tool to search for relevant information.\n If you are asking about the content of the files, please specify any keywords, topics, or context you are looking for to help retrieve the most relevant content.'
  140. """
  141. return (
  142. "## 工具使用规范"
  143. + "可调用工具:"
  144. + "- file_search:根据关键词或短语检索指定文件中的内容,返回匹配的文本片段的内容"
  145. + "**调用规则**:"
  146. + "1. 当问题涉及以下情况时必须调用本工具:"
  147. + " - 询问文件/文档中的具体内容"
  148. + " - 需要查找数据、条款或技术细节"
  149. + ' - 用户明确要求"查文件"或"搜索资料"'
  150. + "2. 调用时需遵循:"
  151. + " ```json"
  152. + " {"
  153. + ' "action": "file_search",'
  154. + ' "action_input": {'
  155. + ' "query": "精炼后的要搜索的关键词或短语,支持精确匹配"'
  156. + " }"
  157. + " }"
  158. )
  159. """