# ingestion.py
import copy
import logging
from abc import ABC
from enum import Enum
from typing import TYPE_CHECKING, ClassVar

from pydantic import BaseModel, Field

from core.base.abstractions import ChunkEnrichmentSettings

from .base import AppConfig, Provider, ProviderConfig
from .llm import CompletionProvider
  9. logger = logging.getLogger()
  10. if TYPE_CHECKING:
  11. from core.database import PostgresDatabaseProvider
  12. class IngestionConfig(ProviderConfig):
  13. _defaults: ClassVar[dict] = {
  14. "app": AppConfig(),
  15. "provider": "r2r",
  16. "excluded_parsers": ["mp4"],
  17. "chunk_enrichment_settings": ChunkEnrichmentSettings(),
  18. "extra_parsers": {},
  19. "audio_transcription_model": "openai/whisper-1",
  20. "vision_img_prompt_name": "vision_img",
  21. "vision_img_model": "openai/gpt-4o",
  22. "vision_pdf_prompt_name": "vision_pdf",
  23. "vision_pdf_model": "openai/gpt-4o",
  24. "skip_document_summary": False,
  25. "document_summary_system_prompt": "default_system",
  26. "document_summary_task_prompt": "default_summary",
  27. "chunks_for_document_summary": 128,
  28. "document_summary_model": "openai/gpt-4o-mini",
  29. "parser_overrides": {},
  30. "extra_fields": {},
  31. }
  32. provider: str = Field(
  33. default_factory=lambda: IngestionConfig._defaults["provider"]
  34. )
  35. excluded_parsers: list[str] = Field(
  36. default_factory=lambda: IngestionConfig._defaults["excluded_parsers"]
  37. )
  38. chunk_enrichment_settings: ChunkEnrichmentSettings = Field(
  39. default_factory=lambda: IngestionConfig._defaults[
  40. "chunk_enrichment_settings"
  41. ]
  42. )
  43. extra_parsers: dict[str, str] = Field(
  44. default_factory=lambda: IngestionConfig._defaults["extra_parsers"]
  45. )
  46. audio_transcription_model: str = Field(
  47. default_factory=lambda: IngestionConfig._defaults[
  48. "audio_transcription_model"
  49. ]
  50. )
  51. vision_img_prompt_name: str = Field(
  52. default_factory=lambda: IngestionConfig._defaults[
  53. "vision_img_prompt_name"
  54. ]
  55. )
  56. vision_img_model: str = Field(
  57. default_factory=lambda: IngestionConfig._defaults["vision_img_model"]
  58. )
  59. vision_pdf_prompt_name: str = Field(
  60. default_factory=lambda: IngestionConfig._defaults[
  61. "vision_pdf_prompt_name"
  62. ]
  63. )
  64. vision_pdf_model: str = Field(
  65. default_factory=lambda: IngestionConfig._defaults["vision_pdf_model"]
  66. )
  67. skip_document_summary: bool = Field(
  68. default_factory=lambda: IngestionConfig._defaults[
  69. "skip_document_summary"
  70. ]
  71. )
  72. document_summary_system_prompt: str = Field(
  73. default_factory=lambda: IngestionConfig._defaults[
  74. "document_summary_system_prompt"
  75. ]
  76. )
  77. document_summary_task_prompt: str = Field(
  78. default_factory=lambda: IngestionConfig._defaults[
  79. "document_summary_task_prompt"
  80. ]
  81. )
  82. chunks_for_document_summary: int = Field(
  83. default_factory=lambda: IngestionConfig._defaults[
  84. "chunks_for_document_summary"
  85. ]
  86. )
  87. document_summary_model: str = Field(
  88. default_factory=lambda: IngestionConfig._defaults[
  89. "document_summary_model"
  90. ]
  91. )
  92. parser_overrides: dict[str, str] = Field(
  93. default_factory=lambda: IngestionConfig._defaults["parser_overrides"]
  94. )
  95. @classmethod
  96. def set_default(cls, **kwargs):
  97. for key, value in kwargs.items():
  98. if key in cls._defaults:
  99. cls._defaults[key] = value
  100. else:
  101. raise AttributeError(
  102. f"No default attribute '{key}' in IngestionConfig"
  103. )
  104. @property
  105. def supported_providers(self) -> list[str]:
  106. return ["r2r", "unstructured_local", "unstructured_api"]
  107. def validate_config(self) -> None:
  108. if self.provider not in self.supported_providers:
  109. raise ValueError(f"Provider {self.provider} is not supported.")
  110. @classmethod
  111. def get_default(cls, mode: str, app) -> "IngestionConfig":
  112. """Return default ingestion configuration for a given mode."""
  113. if mode == "hi-res":
  114. return cls(app=app, parser_overrides={"pdf": "zerox"})
  115. else:
  116. return cls(app=app)
  117. @classmethod
  118. def get_default(cls, mode: str, app) -> "IngestionConfig":
  119. """Return default ingestion configuration for a given mode."""
  120. if mode == "hi-res":
  121. # More thorough parsing, no skipping summaries, possibly larger `chunks_for_document_summary`.
  122. return cls(app=app, parser_overrides={"pdf": "zerox"})
  123. # elif mode == "fast":
  124. # # Skip summaries and other enrichment steps for speed.
  125. # return cls(
  126. # app=app,
  127. # )
  128. else:
  129. # For `custom` or any unrecognized mode, return a base config
  130. return cls(app=app)
  131. @classmethod
  132. def set_default(cls, **kwargs):
  133. for key, value in kwargs.items():
  134. if key in cls._defaults:
  135. cls._defaults[key] = value
  136. else:
  137. raise AttributeError(
  138. f"No default attribute '{key}' in GenerationConfig"
  139. )
  140. class Config:
  141. populate_by_name = True
  142. json_schema_extra = {
  143. "provider": "r2r",
  144. "excluded_parsers": ["mp4"],
  145. "chunk_enrichment_settings": ChunkEnrichmentSettings().dict(),
  146. "extra_parsers": {},
  147. "audio_transcription_model": "openai/whisper-1",
  148. "vision_img_prompt_name": "vision_img",
  149. "vision_img_model": "openai/gpt-4o",
  150. "vision_pdf_prompt_name": "vision_pdf",
  151. "vision_pdf_model": "openai/gpt-4o",
  152. "skip_document_summary": False,
  153. "document_summary_system_prompt": "default_system",
  154. "document_summary_task_prompt": "default_summary",
  155. "chunks_for_document_summary": 128,
  156. "document_summary_model": "openai/gpt-4o-mini",
  157. "parser_overrides": {},
  158. }
  159. class IngestionProvider(Provider, ABC):
  160. config: IngestionConfig
  161. database_provider: "PostgresDatabaseProvider"
  162. llm_provider: CompletionProvider
  163. def __init__(
  164. self,
  165. config: IngestionConfig,
  166. database_provider: "PostgresDatabaseProvider",
  167. llm_provider: CompletionProvider,
  168. ):
  169. super().__init__(config)
  170. self.config: IngestionConfig = config
  171. self.llm_provider = llm_provider
  172. self.database_provider: "PostgresDatabaseProvider" = database_provider
  173. class ChunkingStrategy(str, Enum):
  174. RECURSIVE = "recursive"
  175. CHARACTER = "character"
  176. BASIC = "basic"
  177. BY_TITLE = "by_title"
  178. class IngestionMode(str, Enum):
  179. hi_res = "hi-res"
  180. fast = "fast"
  181. custom = "custom"