ingestion.py 6.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. import logging
  2. from abc import ABC
  3. from enum import Enum
  4. from typing import TYPE_CHECKING, Any, ClassVar
  5. from pydantic import Field
  6. from core.base.abstractions import ChunkEnrichmentSettings
  7. from .base import AppConfig, Provider, ProviderConfig
  8. from .llm import CompletionProvider
  9. logger = logging.getLogger()
  10. if TYPE_CHECKING:
  11. from core.database import PostgresDatabaseProvider
  12. class IngestionConfig(ProviderConfig):
  13. _defaults: ClassVar[dict] = {
  14. "app": AppConfig(),
  15. "provider": "r2r",
  16. "excluded_parsers": ["mp4"],
  17. "chunking_strategy": "recursive",
  18. "chunk_enrichment_settings": ChunkEnrichmentSettings(),
  19. "extra_parsers": {},
  20. "audio_transcription_model": "openai/whisper-1",
  21. "vision_img_prompt_name": "vision_img",
  22. "vision_img_model": "openai/gpt-4o",
  23. "vision_pdf_prompt_name": "vision_pdf",
  24. "vision_pdf_model": "openai/gpt-4o",
  25. "skip_document_summary": False,
  26. "document_summary_system_prompt": "default_system",
  27. "document_summary_task_prompt": "default_summary",
  28. "chunks_for_document_summary": 128,
  29. "document_summary_model": "openai/gpt-4o-mini",
  30. "parser_overrides": {},
  31. "extra_fields": {},
  32. }
  33. provider: str = Field(
  34. default_factory=lambda: IngestionConfig._defaults["provider"]
  35. )
  36. excluded_parsers: list[str] = Field(
  37. default_factory=lambda: IngestionConfig._defaults["excluded_parsers"]
  38. )
  39. chunking_strategy: str = Field(
  40. default_factory=lambda: IngestionConfig._defaults["chunking_strategy"]
  41. )
  42. chunk_enrichment_settings: ChunkEnrichmentSettings = Field(
  43. default_factory=lambda: IngestionConfig._defaults[
  44. "chunk_enrichment_settings"
  45. ]
  46. )
  47. extra_parsers: dict[str, Any] = Field(
  48. default_factory=lambda: IngestionConfig._defaults["extra_parsers"]
  49. )
  50. audio_transcription_model: str = Field(
  51. default_factory=lambda: IngestionConfig._defaults[
  52. "audio_transcription_model"
  53. ]
  54. )
  55. vision_img_prompt_name: str = Field(
  56. default_factory=lambda: IngestionConfig._defaults[
  57. "vision_img_prompt_name"
  58. ]
  59. )
  60. vision_img_model: str = Field(
  61. default_factory=lambda: IngestionConfig._defaults["vision_img_model"]
  62. )
  63. vision_pdf_prompt_name: str = Field(
  64. default_factory=lambda: IngestionConfig._defaults[
  65. "vision_pdf_prompt_name"
  66. ]
  67. )
  68. vision_pdf_model: str = Field(
  69. default_factory=lambda: IngestionConfig._defaults["vision_pdf_model"]
  70. )
  71. skip_document_summary: bool = Field(
  72. default_factory=lambda: IngestionConfig._defaults[
  73. "skip_document_summary"
  74. ]
  75. )
  76. document_summary_system_prompt: str = Field(
  77. default_factory=lambda: IngestionConfig._defaults[
  78. "document_summary_system_prompt"
  79. ]
  80. )
  81. document_summary_task_prompt: str = Field(
  82. default_factory=lambda: IngestionConfig._defaults[
  83. "document_summary_task_prompt"
  84. ]
  85. )
  86. chunks_for_document_summary: int = Field(
  87. default_factory=lambda: IngestionConfig._defaults[
  88. "chunks_for_document_summary"
  89. ]
  90. )
  91. document_summary_model: str = Field(
  92. default_factory=lambda: IngestionConfig._defaults[
  93. "document_summary_model"
  94. ]
  95. )
  96. parser_overrides: dict[str, str] = Field(
  97. default_factory=lambda: IngestionConfig._defaults["parser_overrides"]
  98. )
  99. @classmethod
  100. def set_default(cls, **kwargs):
  101. for key, value in kwargs.items():
  102. if key in cls._defaults:
  103. cls._defaults[key] = value
  104. else:
  105. raise AttributeError(
  106. f"No default attribute '{key}' in IngestionConfig"
  107. )
  108. @property
  109. def supported_providers(self) -> list[str]:
  110. return ["r2r", "unstructured_local", "unstructured_api"]
  111. def validate_config(self) -> None:
  112. if self.provider not in self.supported_providers:
  113. raise ValueError(f"Provider {self.provider} is not supported.")
  114. @classmethod
  115. def get_default(cls, mode: str, app) -> "IngestionConfig":
  116. """Return default ingestion configuration for a given mode."""
  117. if mode == "hi-res":
  118. return cls(app=app, parser_overrides={"pdf": "zerox"})
  119. else:
  120. return cls(app=app)
  121. @classmethod
  122. def get_default(cls, mode: str, app) -> "IngestionConfig":
  123. """Return default ingestion configuration for a given mode."""
  124. if mode == "hi-res":
  125. # More thorough parsing, no skipping summaries, possibly larger `chunks_for_document_summary`.
  126. return cls(app=app, parser_overrides={"pdf": "zerox"})
  127. # elif mode == "fast":
  128. # # Skip summaries and other enrichment steps for speed.
  129. # return cls(
  130. # app=app,
  131. # )
  132. else:
  133. # For `custom` or any unrecognized mode, return a base config
  134. return cls(app=app)
  135. @classmethod
  136. def set_default(cls, **kwargs):
  137. for key, value in kwargs.items():
  138. if key in cls._defaults:
  139. cls._defaults[key] = value
  140. else:
  141. raise AttributeError(
  142. f"No default attribute '{key}' in GenerationConfig"
  143. )
  144. class Config:
  145. populate_by_name = True
  146. json_schema_extra = {
  147. "provider": "r2r",
  148. "excluded_parsers": ["mp4"],
  149. "chunking_strategy": "recursive",
  150. "chunk_enrichment_settings": ChunkEnrichmentSettings().dict(),
  151. "extra_parsers": {},
  152. "audio_transcription_model": "openai/whisper-1",
  153. "vision_img_prompt_name": "vision_img",
  154. "vision_img_model": "openai/gpt-4o",
  155. "vision_pdf_prompt_name": "vision_pdf",
  156. "vision_pdf_model": "openai/gpt-4o",
  157. "skip_document_summary": False,
  158. "document_summary_system_prompt": "default_system",
  159. "document_summary_task_prompt": "default_summary",
  160. "chunks_for_document_summary": 128,
  161. "document_summary_model": "openai/gpt-4o-mini",
  162. "parser_overrides": {},
  163. }
  164. class IngestionProvider(Provider, ABC):
  165. config: IngestionConfig
  166. database_provider: "PostgresDatabaseProvider"
  167. llm_provider: CompletionProvider
  168. def __init__(
  169. self,
  170. config: IngestionConfig,
  171. database_provider: "PostgresDatabaseProvider",
  172. llm_provider: CompletionProvider,
  173. ):
  174. super().__init__(config)
  175. self.config: IngestionConfig = config
  176. self.llm_provider = llm_provider
  177. self.database_provider: "PostgresDatabaseProvider" = database_provider
  178. class ChunkingStrategy(str, Enum):
  179. RECURSIVE = "recursive"
  180. CHARACTER = "character"
  181. BASIC = "basic"
  182. BY_TITLE = "by_title"
  183. class IngestionMode(str, Enum):
  184. hi_res = "hi-res"
  185. fast = "fast"
  186. custom = "custom"