ts_parser.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143
  1. # type: ignore
  2. import re
  3. from typing import AsyncGenerator
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class TSParser(AsyncParser[str | bytes]):
  11. """A parser for TypeScript source code files."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. async def ingest(
  22. self, data: str | bytes, *args, **kwargs
  23. ) -> AsyncGenerator[str, None]:
  24. """Ingest TypeScript source code and yield structured text representation.
  25. Extracts JSDoc comments, function/class/interface definitions, and comments while
  26. preserving the code structure in a text format suitable for analysis.
  27. :param data: The TypeScript source code to parse.
  28. :param kwargs: Additional keyword arguments.
  29. """
  30. if isinstance(data, bytes):
  31. data = data.decode("utf-8", errors="ignore")
  32. # Process the TypeScript code
  33. processed_text = self._process_ts_code(data)
  34. # Yield the processed text
  35. yield processed_text
  36. def _process_ts_code(self, code: str) -> str:
  37. """Process TypeScript code into a more structured text representation.
  38. This method:
  39. 1. Preserves file-level JSDoc comments
  40. 2. Extracts imports and exports
  41. 3. Extracts class, interface, type, and function definitions with their comments
  42. 4. Preserves TypeScript-specific type annotations
  43. """
  44. # Split into lines for processing
  45. lines = code.splitlines()
  46. result = []
  47. # Extract file-level comments
  48. file_comment = self._extract_file_comment(code)
  49. if file_comment:
  50. result.append("FILE COMMENT:")
  51. result.append(file_comment)
  52. result.append("")
  53. # Extract imports and exports
  54. imports_exports = self._extract_imports_exports(lines)
  55. if imports_exports:
  56. result.append("IMPORTS/EXPORTS:")
  57. result.extend(imports_exports)
  58. result.append("")
  59. # Extract definitions (class, interface, type, function)
  60. definitions = self._extract_definitions(code)
  61. if definitions:
  62. result.append("DEFINITIONS:")
  63. result.extend(definitions)
  64. return "\n".join(result)
  65. def _extract_file_comment(self, code: str) -> str:
  66. """Extract the file-level JSDoc comment if present."""
  67. # Look for JSDoc comments at the beginning of the file
  68. file_comment_pattern = r"^\s*/\*\*(.*?)\*/\s*"
  69. match = re.search(file_comment_pattern, code, re.DOTALL)
  70. if match:
  71. # Format the comment by removing asterisks and preserving content
  72. comment = match.group(1)
  73. # Clean up the comment lines
  74. lines = [
  75. line.strip().lstrip("*").strip()
  76. for line in comment.split("\n")
  77. ]
  78. return "\n".join(line for line in lines if line)
  79. return ""
  80. def _extract_imports_exports(self, lines: list[str]) -> list[str]:
  81. """Extract import and export statements from the code."""
  82. statements = []
  83. for line in lines:
  84. line = line.strip()
  85. if (
  86. line.startswith(("import ", "export "))
  87. or re.match(r"^(import|export)\s+\{", line)
  88. ) and not line.startswith("//"):
  89. statements.append(line)
  90. return statements
  91. def _extract_definitions(self, code: str) -> list[str]:
  92. """Extract class, interface, type, and function definitions with their comments."""
  93. definitions = []
  94. # Pattern for definitions with preceding JSDoc comments
  95. # This captures JSDoc comments, export keywords, and various TypeScript definitions
  96. pattern = r"(?:/\*\*(.*?)\*/\s*)?(?:export\s+)?(?:(class|interface|type|enum|function|const|let|var)\s+\w+[\s\S]*?(?:\{|=>|;))"
  97. matches = re.finditer(pattern, code, re.DOTALL)
  98. for match in matches:
  99. jsdoc = match.group(1)
  100. definition = match.group(2) and match.group(0)[match.start(2) :]
  101. if jsdoc:
  102. # Format the JSDoc comment
  103. lines = [
  104. line.strip().lstrip("*").strip()
  105. for line in jsdoc.split("\n")
  106. ]
  107. formatted_jsdoc = "\n".join(line for line in lines if line)
  108. definitions.append(formatted_jsdoc)
  109. if definition:
  110. # Extract the first line or meaningful part of the definition
  111. def_lines = definition.strip().split("\n")
  112. if len(def_lines) > 3: # If definition is long, abbreviate
  113. short_def = "\n".join(def_lines[:3]) + "\n..."
  114. definitions.append(short_def)
  115. else:
  116. definitions.append(definition.strip())
  117. definitions.append("") # Add empty line for readability
  118. return definitions