123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208 |
- import logging
- from abc import ABC
- from enum import Enum
- from typing import TYPE_CHECKING, Any, ClassVar
- from pydantic import Field
- from core.base.abstractions import ChunkEnrichmentSettings
- from .base import AppConfig, Provider, ProviderConfig
- from .llm import CompletionProvider
# Module-scoped logger: records carry this module's name instead of "root",
# and still propagate to whatever handlers are configured on the root logger.
logger = logging.getLogger(__name__)
- if TYPE_CHECKING:
- from core.database import PostgresDatabaseProvider
class IngestionConfig(ProviderConfig):
    """Configuration for ingestion providers.

    Every field's default is looked up in the class-level ``_defaults``
    table when an instance is created, so defaults can be changed
    process-wide through :meth:`set_default`.
    """

    _defaults: ClassVar[dict] = {
        "app": AppConfig(),
        "provider": "r2r",
        "excluded_parsers": ["mp4"],
        "chunking_strategy": "recursive",
        "chunk_enrichment_settings": ChunkEnrichmentSettings(),
        "extra_parsers": {},
        "audio_transcription_model": "openai/whisper-1",
        "vision_img_prompt_name": "vision_img",
        "vision_img_model": "openai/gpt-4o",
        "vision_pdf_prompt_name": "vision_pdf",
        "vision_pdf_model": "openai/gpt-4o",
        "skip_document_summary": False,
        "document_summary_system_prompt": "default_system",
        "document_summary_task_prompt": "default_summary",
        "chunks_for_document_summary": 128,
        "document_summary_model": "openai/gpt-4o-mini",
        "parser_overrides": {},
        "extra_fields": {},
    }

    provider: str = Field(
        default_factory=lambda: IngestionConfig._defaults["provider"]
    )
    # Container defaults are copied on every instantiation so that instances
    # never share the list/dict object stored in ``_defaults`` (an in-place
    # mutation on one instance would otherwise change every later default).
    excluded_parsers: list[str] = Field(
        default_factory=lambda: list(
            IngestionConfig._defaults["excluded_parsers"]
        )
    )
    chunking_strategy: str = Field(
        default_factory=lambda: IngestionConfig._defaults["chunking_strategy"]
    )
    # NOTE(review): this still hands out the single shared settings object
    # from ``_defaults``; copy it here if ChunkEnrichmentSettings instances
    # are ever mutated in place.
    chunk_enrichment_settings: ChunkEnrichmentSettings = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "chunk_enrichment_settings"
        ]
    )
    # Shallow copy: the mapping itself is per-instance, nested values are not.
    extra_parsers: dict[str, Any] = Field(
        default_factory=lambda: dict(
            IngestionConfig._defaults["extra_parsers"]
        )
    )
    audio_transcription_model: str = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "audio_transcription_model"
        ]
    )
    vision_img_prompt_name: str = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "vision_img_prompt_name"
        ]
    )
    vision_img_model: str = Field(
        default_factory=lambda: IngestionConfig._defaults["vision_img_model"]
    )
    vision_pdf_prompt_name: str = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "vision_pdf_prompt_name"
        ]
    )
    vision_pdf_model: str = Field(
        default_factory=lambda: IngestionConfig._defaults["vision_pdf_model"]
    )
    skip_document_summary: bool = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "skip_document_summary"
        ]
    )
    document_summary_system_prompt: str = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "document_summary_system_prompt"
        ]
    )
    document_summary_task_prompt: str = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "document_summary_task_prompt"
        ]
    )
    chunks_for_document_summary: int = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "chunks_for_document_summary"
        ]
    )
    document_summary_model: str = Field(
        default_factory=lambda: IngestionConfig._defaults[
            "document_summary_model"
        ]
    )
    parser_overrides: dict[str, str] = Field(
        default_factory=lambda: dict(
            IngestionConfig._defaults["parser_overrides"]
        )
    )

    @classmethod
    def set_default(cls, **kwargs):
        """Override one or more class-wide default values.

        All keys are checked before anything is written, so a failing call
        leaves ``_defaults`` untouched.

        Args:
            **kwargs: Mapping of default name to its new value. Each name
                must already be a key of ``_defaults``.

        Raises:
            AttributeError: If a name is not a known default.
        """
        for key in kwargs:
            if key not in cls._defaults:
                raise AttributeError(
                    f"No default attribute '{key}' in IngestionConfig"
                )
        cls._defaults.update(kwargs)

    @property
    def supported_providers(self) -> list[str]:
        """Provider names accepted by :meth:`validate_config`."""
        return ["r2r", "unstructured_local", "unstructured_api"]

    def validate_config(self) -> None:
        """Check that ``provider`` is one of ``supported_providers``.

        Raises:
            ValueError: If the configured provider is not supported.
        """
        if self.provider not in self.supported_providers:
            raise ValueError(f"Provider {self.provider} is not supported.")

    @classmethod
    def get_default(cls, mode: str, app) -> "IngestionConfig":
        """Return the default ingestion configuration for a given mode.

        Args:
            mode: Ingestion mode name. ``"hi-res"`` routes PDFs to the
                ``"zerox"`` parser; every other value (including ``"fast"``
                and ``"custom"``) currently yields the base configuration.
            app: Application config forwarded to the constructor as ``app``.

        Returns:
            A new ``IngestionConfig`` instance.
        """
        if mode == "hi-res":
            return cls(app=app, parser_overrides={"pdf": "zerox"})
        # No mode-specific tuning exists for other modes yet.
        return cls(app=app)

    class Config:
        populate_by_name = True
        json_schema_extra = {
            "provider": "r2r",
            "excluded_parsers": ["mp4"],
            "chunking_strategy": "recursive",
            "chunk_enrichment_settings": ChunkEnrichmentSettings().dict(),
            "extra_parsers": {},
            "audio_transcription_model": "openai/whisper-1",
            "vision_img_prompt_name": "vision_img",
            "vision_img_model": "openai/gpt-4o",
            "vision_pdf_prompt_name": "vision_pdf",
            "vision_pdf_model": "openai/gpt-4o",
            "skip_document_summary": False,
            "document_summary_system_prompt": "default_system",
            "document_summary_task_prompt": "default_summary",
            "chunks_for_document_summary": 128,
            "document_summary_model": "openai/gpt-4o-mini",
            "parser_overrides": {},
        }
class IngestionProvider(Provider, ABC):
    """Abstract base class for ingestion providers.

    Keeps the ingestion configuration together with the database provider
    and the completion (LLM) provider passed to the constructor.
    """

    config: IngestionConfig
    database_provider: "PostgresDatabaseProvider"
    llm_provider: CompletionProvider

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: "PostgresDatabaseProvider",
        llm_provider: CompletionProvider,
    ):
        """Initialise the base provider, then store the collaborators.

        Args:
            config: Ingestion configuration, also forwarded to the base
                ``Provider`` constructor.
            database_provider: Database provider stored on the instance.
            llm_provider: Completion provider stored on the instance.
        """
        super().__init__(config)
        # Attribute types are declared once at class level above.
        self.config = config
        self.database_provider = database_provider
        self.llm_provider = llm_provider
class ChunkingStrategy(str, Enum):
    """Names of the available chunking strategies.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    RECURSIVE = "recursive"
    CHARACTER = "character"
    BASIC = "basic"
    BY_TITLE = "by_title"
class IngestionMode(str, Enum):
    """Names of the ingestion modes.

    Members are also ``str`` instances, so they compare equal to their
    plain string values.
    """

    hi_res = "hi-res"
    fast = "fast"
    custom = "custom"
|