advanced_kg_cookbook.py

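"""Cookbook: build a knowledge graph in R2R from Y Combinator company pages.

The script reads company URLs from ../data/yc_companies.txt, scrapes and
cleans each page, customizes the relationship-extraction prompt with the
entity types and relations defined in main(), and ingests the cleaned text
into a running R2R server.

Example invocation (a sketch: it assumes an R2R server is already reachable
at the given base_url; python-fire exposes main()'s keyword arguments as flags):

    python advanced_kg_cookbook.py --max_entries=10 --base_url=http://localhost:7272
"""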
import os

import fire
import requests
from bs4 import BeautifulSoup, Comment
from r2r import R2RClient


def escape_braces(text):
    # Escape curly braces so the text can be embedded safely in a format string
    return text.replace("{", "{{").replace("}", "}}")


def get_all_yc_co_directory_urls():
    # Map company slugs to their YC directory URLs from the local dump file
    this_file_path = os.path.abspath(os.path.dirname(__file__))
    yc_company_dump_path = os.path.join(
        this_file_path, "..", "data", "yc_companies.txt"
    )

    with open(yc_company_dump_path, "r") as f:
        urls = f.readlines()
    urls = [url.strip() for url in urls]

    return {url.split("/")[-1]: url for url in urls}
# Function to fetch and clean HTML content
def fetch_and_clean_yc_co_data(url):
    # Fetch the HTML content from the URL
    response = requests.get(url)
    response.raise_for_status()  # Raise an error for bad status codes
    html_content = response.text

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")

    # Remove all <script>, <style>, <meta>, <link>, <header>, <nav>, and <footer> elements
    for element in soup(
        ["script", "style", "meta", "link", "header", "nav", "footer"]
    ):
        element.decompose()

    # Remove HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()

    # Select the main content (adjust the selector to match the structure of your target pages)
    main_content = soup.select_one("main") or soup.body

    if main_content:
        # Collect span and link text separately as lightweight metadata
        spans = main_content.find_all(["span", "a"])
        proc_spans = []
        for span in spans:
            proc_spans.append(span.get_text(separator=" ", strip=True))
        span_text = "\n".join(proc_spans)

        # Extract the text content from the main content
        paragraphs = main_content.find_all(
            ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]
        )
        cleaned_text = (
            "### Bulk:\n\n"
            + "\n\n".join(
                paragraph.get_text(separator=" ", strip=True)
                for paragraph in paragraphs
            )
            + "\n\n### Metadata:\n\n"
            + span_text
        )
        return cleaned_text
    else:
        return "Main content not found"
def execute_query(provider, query, params=None):
    # Run a raw query against the graph provider's database session
    print(f"Executing query: {query}")
    with provider.client.session(database=provider._database) as session:
        result = session.run(query, params or {})
        return [record.data() for record in result]
def main(
    max_entries=50,
    local_mode=True,
    base_url="http://localhost:7272",
):
    # Specify the entity types for the KG extraction prompt
    entity_types = [
        "COMPANY",
        "SCHOOL",
        "LOCATION",
        "PERSON",
        "DATE",
        "OTHER",
        "QUANTITY",
        "EVENT",
        "INDUSTRY",
        "MEDIA",
    ]

    # Specify the relations for the KG construction
    relations = [
        # Founder relations
        "EDUCATED_AT",
        "WORKED_AT",
        "FOUNDED",
        # Company relations
        "RAISED",
        "REVENUE",
        "TEAM_SIZE",
        "LOCATION",
        "ACQUIRED_BY",
        "ANNOUNCED",
        "INDUSTRY",
        # Product relations
        "PRODUCT",
        "FEATURES",
        "TECHNOLOGY",
        # Additional relations
        "HAS",
        "AS_OF",
        "PARTICIPATED",
        "ASSOCIATED",
    ]
    client = R2RClient(base_url=base_url)

    # Customize the relationship-extraction prompt with our entity types and relations
    prompt = "graphrag_relationships_extraction_few_shot"
    client.update_prompt(
        prompt,
        input_types={"entity_types": entity_types, "relations": relations},
    )

    url_map = get_all_yc_co_directory_urls()

    i = 0
    # Clean and ingest the data for each company, up to max_entries
    for company, url in url_map.items():
        if i >= max_entries:
            break
        i += 1

        company_data = fetch_and_clean_yc_co_data(url)

        try:
            # Ingest as a text document
            file_name = f"{company}.txt"
            with open(file_name, "w") as f:
                f.write(company_data)

            client.ingest_files(
                [file_name],
                metadatas=[{"title": company}],
            )
            os.remove(file_name)
        except Exception:
            continue


if __name__ == "__main__":
    fire.Fire(main)