# jack/openassistant
import logging
from typing import Type, List

from pydantic import BaseModel, Field
from sqlalchemy.orm import Session

from app.core.tools.base_tool import BaseTool
from app.models.run import Run
from app.services.file.file import FileService
from app.services.assistant.assistant import AssistantService


# return '## important：You can use the "retrieval" tool to search for relevant information.\n If you are asking about the content of the files, please specify any keywords, topics, or context you are looking for to help retrieve the most relevant content.'


# query: str = Field(
#    ...,
#    description="query to look up in retrieval",
# )
# asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
# query: str = Field(..., description="query to look up in retrieval")
class FileSearchToolInput(BaseModel):
    # Argument schema of the file_search tool: one required free-text query.
    # Documented with comments rather than a class docstring, because pydantic
    # copies a model's docstring into the JSON schema it generates.
    # `...` as the default marks the field as required.
    query: str = Field(..., description="query to look up in retrieval")


class FileSearchTool(BaseTool):
    """Tool that searches the content of files attached to a run.

    ``configure`` collects the file keys to search from the run, ``run``
    forwards a query to ``FileService.search_in_files`` and
    ``instruction_supplement`` contributes an optional extra prompt hint.
    """

    name: str = "file_search"
    description: str = (
        "Use this function to search through the content of files uploaded by the user. "
        + "Trigger this function whenever the user refers to content within any uploaded document or file, even if they do not specify a file name or type. "
        + "If multiple files are uploaded and the user does not indicate a specific file, perform the search across all available uploaded files and return relevant results from each, clearly stating which file each result comes from. "
        + "If the user's request is ambiguous, default to considering all relevant uploaded files, and, if possible, provide a brief summary of the contents of each file to help clarify. "
        # Trailing space added so this sentence is not fused with the next one.
        + "If the user has uploaded files in multiple batches, and their request is ambiguous (e.g. 'summarize the documents'), default to summarizing only the most recent batch of uploaded files. If the user's intent is to include older files, they should specify this explicitly. "
        + "Prioritize providing useful information by erring on the side of inclusion rather than exclusion when the user's intent is not explicit. "
        + "Singleton operation: Strictly 1 invocation per API call."
    )
    args_schema: Type[BaseModel] = FileSearchToolInput

    def __init__(self) -> None:
        super().__init__()
        self.__filenames = []  # not read anywhere in this class
        self.__keys = []  # file keys passed to FileService.search_in_files
        self.__dirkeys = []  # folder keys for the search; never filled in this class
        self.loop = None  # not used in this class
        self.index = 0  # 0 until a search has succeeded, then 1

    def configure(self, session: Session, run: Run, **kwargs):
        """Collect the file keys that later searches are restricted to.

        Entries of ``run.file_ids`` that are exactly 36 characters long
        (presumably UUID strings -- confirm with the caller) are used as
        search keys directly. Every other entry is resolved through
        ``FileService.get_file_list_by_ids`` and the ``key`` attribute of each
        returned file is used instead. Keys are appended to the keys already
        held by this instance.

        Args:
            session: Database session handed to ``FileService``.
            run: Run whose ``file_ids`` are inspected; a ``None`` value is
                treated as "no files".
            **kwargs: Accepted for interface compatibility and ignored.
        """
        log = logging.getLogger(__name__)

        # Files may later be chosen from the knowledge base, so an id is not
        # guaranteed to exist in the openassistant database.
        legacy_ids = []
        for file_id in run.file_ids or []:
            if len(file_id) == 36:
                # Usable for retrieval as-is.
                self.__keys.append(file_id)
            else:
                # Assistant-side id: it cannot be searched directly, the
                # file's key field has to be looked up first.
                legacy_ids.append(file_id)

        # Legacy path for the early "uuid.ext" style ids; meant to be removed.
        if legacy_ids:
            log.debug("file_search: resolving legacy file ids %s", legacy_ids)
            resolved = FileService.get_file_list_by_ids(
                session=session, file_ids=legacy_ids
            )
            self.__keys.extend(file.key for file in resolved)

        # Loading folder keys from the assistant's tool_resources
        # ("file_search" -> "vector_stores"[0] -> "folder_ids") is currently
        # disabled, so the folder key list is left as it is.
        log.debug(
            "file_search configured: file_keys=%s folder_keys=%s",
            self.__keys,
            self.__dirkeys,
        )

    def run(self, query: str) -> dict:
        """Search the configured files for ``query``.

        The search is only attempted while no earlier search on this instance
        has succeeded and ``query`` is non-empty. A failing search is logged
        and leaves the instance free to try again on the next call.

        Args:
            query: Text to look up.

        Returns:
            The result of ``FileService.search_in_files``, or an empty list
            when no search was attempted or the search raised.
            NOTE(review): the fallback is a list although the annotation says
            ``dict`` -- confirm what ``search_in_files`` returns.
        """
        log = logging.getLogger(__name__)
        log.debug(
            "file_search run: file_keys=%s folder_keys=%s",
            self.__keys,
            self.__dirkeys,
        )

        files = []
        # A non-empty query is required to trigger the search.
        if self.index == 0 and query:
            try:
                files = FileService.search_in_files(
                    query=query, file_keys=self.__keys, folder_keys=self.__dirkeys
                )
            except Exception:
                # Best effort: keep the tool call alive, but record the traceback.
                log.exception("file_search: search_in_files failed")
            else:
                self.index = 1
        return files

    def instruction_supplement(self) -> str:
        """Return extra prompt text that helps the LLM decide on tool use.

        Returns:
            An empty string when any file or folder key is configured,
            otherwise a fixed hint telling the model to use the file search
            tool instead of guessing.
            NOTE(review): the hint is emitted only when nothing is configured
            to search -- confirm the condition is not inverted.
        """
        if self.__keys or self.__dirkeys:
            return ""
        return "如果您不确定用户发的文件内容或者代码库结构，请使用文件搜索工具读取内容并收集相关信息，不要瞎猜或者编造答案。"