# js_parser.py (7.6 KB)
  1. # type: ignore
  2. import re
  3. from typing import AsyncGenerator
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class JSParser(AsyncParser[str | bytes]):
  11. """A parser for JavaScript files."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. async def ingest(
  22. self, data: str | bytes, *args, **kwargs
  23. ) -> AsyncGenerator[str, None]:
  24. """Ingest JavaScript data and yield structured text representation.
  25. Extracts functions, classes, variable declarations, comments, and other
  26. important structures from JavaScript code in a text format suitable for analysis.
  27. :param data: The JavaScript content to parse
  28. :param kwargs: Additional keyword arguments
  29. """
  30. if isinstance(data, bytes):
  31. data = data.decode("utf-8", errors="ignore")
  32. # Process the JavaScript content
  33. processed_text = self._process_js_content(data)
  34. # Yield the processed text
  35. yield processed_text
  36. def _process_js_content(self, js: str) -> str:
  37. """Process JavaScript content into a structured text representation.
  38. This method:
  39. 1. Extracts and preserves comments
  40. 2. Identifies imports and exports
  41. 3. Extracts function and class definitions
  42. 4. Identifies variable declarations
  43. 5. Formats the JavaScript structure in a readable way
  44. """
  45. # Extract comments
  46. comments = self._extract_comments(js)
  47. # Extract imports and exports
  48. imports_exports = self._extract_imports_exports(js)
  49. # Extract function definitions
  50. functions = self._extract_functions(js)
  51. # Extract class definitions
  52. classes = self._extract_classes(js)
  53. # Extract variable declarations
  54. variables = self._extract_variables(js)
  55. # Build the result
  56. result = []
  57. if comments:
  58. result.append("COMMENTS:")
  59. result.extend(comments)
  60. result.append("")
  61. if imports_exports:
  62. result.append("IMPORTS AND EXPORTS:")
  63. result.extend(imports_exports)
  64. result.append("")
  65. if functions:
  66. result.append("FUNCTIONS:")
  67. result.extend(functions)
  68. result.append("")
  69. if classes:
  70. result.append("CLASSES:")
  71. result.extend(classes)
  72. result.append("")
  73. if variables:
  74. result.append("VARIABLE DECLARATIONS:")
  75. result.extend(variables)
  76. result.append("")
  77. return "\n".join(result)
  78. def _extract_comments(self, js: str) -> list[str]:
  79. """Extract comments from JavaScript content."""
  80. # Extract multi-line comments
  81. multiline_pattern = r"/\*(.*?)\*/"
  82. multiline_comments = re.findall(multiline_pattern, js, re.DOTALL)
  83. # Extract single-line comments
  84. singleline_pattern = r"//(.+)$"
  85. singleline_comments = re.findall(singleline_pattern, js, re.MULTILINE)
  86. comments = []
  87. # Add multi-line comments
  88. for comment in multiline_comments:
  89. formatted_comment = comment.strip()
  90. if formatted_comment:
  91. comments.append(formatted_comment)
  92. # Add single-line comments
  93. for comment in singleline_comments:
  94. formatted_comment = comment.strip()
  95. if formatted_comment:
  96. comments.append(formatted_comment)
  97. return comments
  98. def _extract_imports_exports(self, js: str) -> list[str]:
  99. """Extract import and export statements."""
  100. # Remove comments to simplify parsing
  101. js_without_comments = self._remove_comments(js)
  102. # Match import statements
  103. import_pattern = r"import\s+.*?;|import\s+.*?\s+from\s+.*?;"
  104. imports = re.findall(import_pattern, js_without_comments)
  105. # Match export statements
  106. export_pattern = (
  107. r"export\s+.*?;|export\s+default\s+.*?;|export\s+\{.*?\};"
  108. )
  109. exports = re.findall(export_pattern, js_without_comments)
  110. results = []
  111. for stmt in imports + exports:
  112. results.append(stmt.strip())
  113. return results
  114. def _extract_functions(self, js: str) -> list[str]:
  115. """Extract function definitions."""
  116. # Remove comments to simplify parsing
  117. js_without_comments = self._remove_comments(js)
  118. results = []
  119. # Match regular function declarations
  120. func_pattern = r"function\s+(\w+)\s*\([^)]*\)\s*\{[^{]*\}"
  121. funcs = re.finditer(func_pattern, js_without_comments)
  122. for func in funcs:
  123. # Get the function signature
  124. signature = func.group(0)
  125. # Extract just the function declaration line
  126. declaration = re.search(r"function\s+\w+\s*\([^)]*\)", signature)
  127. if declaration:
  128. results.append(declaration.group(0))
  129. # Match arrow functions with explicit names
  130. arrow_pattern = (
  131. r"(?:const|let|var)\s+(\w+)\s*=\s*(?:\([^)]*\)|[^=;]*)\s*=>\s*\{?"
  132. )
  133. arrows = re.finditer(arrow_pattern, js_without_comments)
  134. for arrow in arrows:
  135. results.append(arrow.group(0))
  136. # Match method definitions in objects and classes
  137. method_pattern = r"(\w+)\s*\([^)]*\)\s*\{"
  138. methods = re.finditer(method_pattern, js_without_comments)
  139. for method in methods:
  140. # Filter out if/for/while statements
  141. if not re.match(r"(if|for|while|switch)\s*\(", method.group(0)):
  142. results.append(method.group(0))
  143. return results
  144. def _extract_classes(self, js: str) -> list[str]:
  145. """Extract class definitions."""
  146. # Remove comments to simplify parsing
  147. js_without_comments = self._remove_comments(js)
  148. results = []
  149. # Match class declarations
  150. class_pattern = r"class\s+(\w+)(?:\s+extends\s+(\w+))?\s*\{"
  151. classes = re.finditer(class_pattern, js_without_comments)
  152. for cls in classes:
  153. results.append(cls.group(0))
  154. # Match class expressions
  155. class_expr_pattern = (
  156. r"(?:const|let|var)\s+(\w+)\s*=\s*class(?:\s+\w+)?\s*\{"
  157. )
  158. class_exprs = re.finditer(class_expr_pattern, js_without_comments)
  159. for cls_expr in class_exprs:
  160. results.append(cls_expr.group(0))
  161. return results
  162. def _extract_variables(self, js: str) -> list[str]:
  163. """Extract variable declarations."""
  164. # Remove comments to simplify parsing
  165. js_without_comments = self._remove_comments(js)
  166. # Match variable declarations (excluding function/class assignments)
  167. var_pattern = r"(?:const|let|var)\s+\w+(?:\s*=\s*[^=>{].*?)?;"
  168. vars_raw = re.finditer(var_pattern, js_without_comments)
  169. results = []
  170. for var in vars_raw:
  171. var_text = var.group(0).strip()
  172. # Skip function/arrow function assignments which are handled separately
  173. if not re.search(r"=\s*function|\s*=>\s*", var_text):
  174. results.append(var_text)
  175. return results
  176. def _remove_comments(self, js: str) -> str:
  177. """Remove comments from JavaScript code to simplify parsing."""
  178. # Remove multi-line comments
  179. js = re.sub(r"/\*.*?\*/", "", js, flags=re.DOTALL)
  180. # Remove single-line comments
  181. js = re.sub(r"//.*?$", "", js, flags=re.MULTILINE)
  182. return js