ingestion.py 1.5 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950
  1. # Abstractions for ingestion
  2. from enum import Enum
  3. from pydantic import Field
  4. from .base import R2RSerializable
  5. from .llm import GenerationConfig
  6. class ChunkEnrichmentStrategy(str, Enum):
  7. SEMANTIC = "semantic"
  8. NEIGHBORHOOD = "neighborhood"
  9. def __str__(self) -> str:
  10. return self.value
  11. class ChunkEnrichmentSettings(R2RSerializable):
  12. """
  13. Settings for chunk enrichment.
  14. """
  15. enable_chunk_enrichment: bool = Field(
  16. default=False,
  17. description="Whether to enable chunk enrichment or not",
  18. )
  19. strategies: list[ChunkEnrichmentStrategy] = Field(
  20. default=[],
  21. description="The strategies to use for chunk enrichment. Union of chunks obtained from each strategy is used as context.",
  22. )
  23. forward_chunks: int = Field(
  24. default=3,
  25. description="The number after the current chunk to include in the LLM context while enriching",
  26. )
  27. backward_chunks: int = Field(
  28. default=3,
  29. description="The number of chunks before the current chunk in the LLM context while enriching",
  30. )
  31. semantic_neighbors: int = Field(
  32. default=10, description="The number of semantic neighbors to include"
  33. )
  34. semantic_similarity_threshold: float = Field(
  35. default=0.7,
  36. description="The similarity threshold for semantic neighbors",
  37. )
  38. generation_config: GenerationConfig = Field(
  39. default=GenerationConfig(),
  40. description="The generation config to use for chunk enrichment",
  41. )