# kg.py
from enum import Enum

from pydantic import Field

from .base import R2RSerializable
from .llm import GenerationConfig
  5. class KGRunType(str, Enum):
  6. """Type of KG run."""
  7. ESTIMATE = "estimate"
  8. RUN = "run" # deprecated
  9. def __str__(self):
  10. return self.value
  11. GraphRunType = KGRunType
  12. class KGEntityDeduplicationType(str, Enum):
  13. """Type of KG entity deduplication."""
  14. BY_NAME = "by_name"
  15. BY_DESCRIPTION = "by_description"
  16. BY_LLM = "by_llm"
  17. def __str__(self):
  18. return self.value
  19. class KGCreationSettings(R2RSerializable):
  20. """Settings for knowledge graph creation."""
  21. clustering_mode: str = Field(
  22. default="local",
  23. description="Whether to use remote clustering for graph creation.",
  24. )
  25. graphrag_relationships_extraction_few_shot: str = Field(
  26. default="graphrag_relationships_extraction_few_shot",
  27. description="The prompt to use for knowledge graph extraction.",
  28. alias="graphrag_relationships_extraction_few_shot", # TODO - mark deprecated & remove
  29. )
  30. graph_entity_description_prompt: str = Field(
  31. default="graphrag_entity_description",
  32. description="The prompt to use for entity description generation.",
  33. alias="graphrag_entity_description_prompt", # TODO - mark deprecated & remove
  34. )
  35. entity_types: list[str] = Field(
  36. default=[],
  37. description="The types of entities to extract.",
  38. )
  39. relation_types: list[str] = Field(
  40. default=[],
  41. description="The types of relations to extract.",
  42. )
  43. chunk_merge_count: int = Field(
  44. default=4,
  45. description="The number of extractions to merge into a single KG extraction.",
  46. )
  47. max_knowledge_relationships: int = Field(
  48. default=100,
  49. description="The maximum number of knowledge relationships to extract from each chunk.",
  50. )
  51. max_description_input_length: int = Field(
  52. default=65536,
  53. description="The maximum length of the description for a node in the graph.",
  54. )
  55. generation_config: GenerationConfig = Field(
  56. default_factory=GenerationConfig,
  57. description="Configuration for text generation during graph enrichment.",
  58. )
  59. class KGEntityDeduplicationSettings(R2RSerializable):
  60. """Settings for knowledge graph entity deduplication."""
  61. graph_entity_deduplication_type: KGEntityDeduplicationType = Field(
  62. default=KGEntityDeduplicationType.BY_NAME,
  63. description="The type of entity deduplication to use.",
  64. )
  65. max_description_input_length: int = Field(
  66. default=65536,
  67. description="The maximum length of the description for a node in the graph.",
  68. )
  69. graph_entity_deduplication_prompt: str = Field(
  70. default="graphrag_entity_deduplication",
  71. description="The prompt to use for knowledge graph entity deduplication.",
  72. )
  73. generation_config: GenerationConfig = Field(
  74. default_factory=GenerationConfig,
  75. description="Configuration for text generation during graph entity deduplication.",
  76. )
  77. class KGEnrichmentSettings(R2RSerializable):
  78. """Settings for knowledge graph enrichment."""
  79. force_kg_enrichment: bool = Field(
  80. default=False,
  81. description="Force run the enrichment step even if graph creation is still in progress for some documents.",
  82. )
  83. graphrag_communities: str = Field(
  84. default="graphrag_communities",
  85. description="The prompt to use for knowledge graph enrichment.",
  86. alias="graphrag_communities", # TODO - mark deprecated & remove
  87. )
  88. max_summary_input_length: int = Field(
  89. default=65536,
  90. description="The maximum length of the summary for a community.",
  91. )
  92. generation_config: GenerationConfig = Field(
  93. default_factory=GenerationConfig,
  94. description="Configuration for text generation during graph enrichment.",
  95. )
  96. leiden_params: dict = Field(
  97. default_factory=dict,
  98. description="Parameters for the Leiden algorithm.",
  99. )
  100. class GraphEntitySettings(R2RSerializable):
  101. """Settings for knowledge graph entity creation."""
  102. graph_entity_deduplication_type: KGEntityDeduplicationType = Field(
  103. default=KGEntityDeduplicationType.BY_NAME,
  104. description="The type of entity deduplication to use.",
  105. )
  106. max_description_input_length: int = Field(
  107. default=65536,
  108. description="The maximum length of the description for a node in the graph.",
  109. )
  110. graph_entity_deduplication_prompt: str = Field(
  111. default="graphrag_entity_deduplication",
  112. description="The prompt to use for knowledge graph entity deduplication.",
  113. )
  114. generation_config: GenerationConfig = Field(
  115. default_factory=GenerationConfig,
  116. description="Configuration for text generation during graph entity deduplication.",
  117. )
  118. class GraphRelationshipSettings(R2RSerializable):
  119. """Settings for knowledge graph relationship creation."""
  120. pass
  121. class GraphCommunitySettings(R2RSerializable):
  122. """Settings for knowledge graph community enrichment."""
  123. force_kg_enrichment: bool = Field(
  124. default=False,
  125. description="Force run the enrichment step even if graph creation is still in progress for some documents.",
  126. )
  127. graphrag_communities: str = Field(
  128. default="graphrag_communities",
  129. description="The prompt to use for knowledge graph enrichment.",
  130. )
  131. max_summary_input_length: int = Field(
  132. default=65536,
  133. description="The maximum length of the summary for a community.",
  134. )
  135. generation_config: GenerationConfig = Field(
  136. default_factory=GenerationConfig,
  137. description="Configuration for text generation during graph enrichment.",
  138. )
  139. leiden_params: dict = Field(
  140. default_factory=dict,
  141. description="Parameters for the Leiden algorithm.",
  142. )
  143. class GraphBuildSettings(R2RSerializable):
  144. """Settings for knowledge graph build."""
  145. entity_settings: GraphEntitySettings = Field(
  146. default=GraphEntitySettings(),
  147. description="Settings for knowledge graph entity creation.",
  148. )
  149. relationship_settings: GraphRelationshipSettings = Field(
  150. default=GraphRelationshipSettings(),
  151. description="Settings for knowledge graph relationship creation.",
  152. )
  153. community_settings: GraphCommunitySettings = Field(
  154. default=GraphCommunitySettings(),
  155. description="Settings for knowledge graph community enrichment.",
  156. )