# jack/r2rpy
# type: ignore
import re
from typing import AsyncGenerator

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)


class TSParser(AsyncParser[str | bytes]):
    """A parser for TypeScript source code files.

    The parser is purely regex based (it does not build a TypeScript AST), so
    the output is a best-effort textual outline: the leading file comment, the
    import/export statements, and the headers of class/interface/type/enum/
    function/variable definitions together with their JSDoc comments.
    Keywords that appear inside ordinary comments or string literals may
    still be picked up as definitions.
    """

    # JSDoc block at the very top of the file (only whitespace may precede it).
    _FILE_COMMENT_RE = re.compile(r"\s*/\*\*(.*?)\*/", re.DOTALL)

    # Optional JSDoc + optional modifiers + definition keyword + name, up to
    # the first "{", "=>" or ";".
    #   group(1): JSDoc body; the tempered dot "(?:(?!\*/).)*" keeps it from
    #             running past its own closing "*/" into unrelated code.
    #   group(2): the definition keyword; "\b" stops it from matching inside
    #             longer identifiers such as "subclass".
    _DEFINITION_RE = re.compile(
        r"(?:/\*\*((?:(?!\*/).)*)\*/\s*)?"
        r"\b(?:export\s+)?(?:default\s+)?(?:declare\s+)?"
        r"(?:abstract\s+)?(?:async\s+)?"
        r"(class|interface|type|enum|function|const|let|var)\s+\w+"
        r"[\s\S]*?(?:\{|=>|;)",
        re.DOTALL,
    )

    # A line that begins an import/export statement.
    _IMPORT_EXPORT_RE = re.compile(r"(?:import|export)(?:\s|\{)")

    # First line of a named import/export list whose closing "}" is on a
    # later line, e.g. "import {" or "export type {". Lines such as
    # "export class Foo {" deliberately do not match.
    _OPEN_LIST_RE = re.compile(
        r"(?:import\b[^{}'\"]*|export\s*(?:type\s+)?)\{[^}]*$"
    )

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the providers and ingestion configuration on the instance.

        :param config: The ingestion configuration.
        :param database_provider: The database provider.
        :param llm_provider: The completion (LLM) provider.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest TypeScript source code and yield structured text representation.

        Extracts JSDoc comments, function/class/interface definitions, and comments while
        preserving the code structure in a text format suitable for analysis.
        Exactly one string is yielded per call.

        :param data: The TypeScript source code to parse.
        :param kwargs: Additional keyword arguments.
        """
        if isinstance(data, bytes):
            # "utf-8-sig" decodes like UTF-8 but also drops a leading BOM,
            # which would otherwise hide a file-level comment from the
            # start-of-file match.
            data = data.decode("utf-8-sig", errors="ignore")

        yield self._process_ts_code(data)

    def _process_ts_code(self, code: str) -> str:
        """Process TypeScript code into a more structured text representation.

        The result is made of up to three sections, each emitted only when it
        has content: ``FILE COMMENT:``, ``IMPORTS/EXPORTS:`` and
        ``DEFINITIONS:``.

        :param code: The TypeScript source text.
        :return: The sections joined with newlines.
        """
        result: list[str] = []

        file_comment = self._extract_file_comment(code)
        if file_comment:
            result.extend(["FILE COMMENT:", file_comment, ""])

        imports_exports = self._extract_imports_exports(code.splitlines())
        if imports_exports:
            result.append("IMPORTS/EXPORTS:")
            result.extend(imports_exports)
            result.append("")

        definitions = self._extract_definitions(code)
        if definitions:
            result.append("DEFINITIONS:")
            result.extend(definitions)

        return "\n".join(result)

    @staticmethod
    def _clean_jsdoc(comment: str) -> str:
        """Strip the leading ``*`` decoration and blank lines from a JSDoc body.

        :param comment: The text between ``/**`` and ``*/``.
        :return: The remaining non-empty lines joined with newlines.
        """
        cleaned = (
            line.strip().lstrip("*").strip() for line in comment.splitlines()
        )
        return "\n".join(line for line in cleaned if line)

    def _extract_file_comment(self, code: str) -> str:
        """Extract the file-level JSDoc comment if present.

        :param code: The TypeScript source text.
        :return: The cleaned comment text, or ``""`` when the file does not
            start with a JSDoc block.
        """
        match = self._FILE_COMMENT_RE.match(code)
        if match is None:
            return ""
        return self._clean_jsdoc(match.group(1))

    def _extract_imports_exports(self, lines: list[str]) -> list[str]:
        """Extract import and export statements from the code.

        Single-line statements are returned stripped. A named import/export
        list that spans several lines (``import {`` ... ``} from "x";``) is
        collapsed into one space-joined statement instead of being cut off
        after its first line.

        :param lines: The source split into lines.
        :return: The statements in source order.
        """
        statements: list[str] = []
        pending: list[str] = []  # pieces of a list still waiting for its "}"

        for raw_line in lines:
            line = raw_line.strip()

            if pending:
                if line:
                    pending.append(line)
                if "}" in line:
                    statements.append(" ".join(pending))
                    pending = []
                continue

            if not self._IMPORT_EXPORT_RE.match(line):
                continue

            if self._OPEN_LIST_RE.match(line):
                pending.append(line)
            else:
                statements.append(line)

        if pending:
            # Unterminated list at end of input: keep what was collected.
            statements.append(" ".join(pending))

        return statements

    def _extract_definitions(self, code: str) -> list[str]:
        """Extract class, interface, type, and function definitions with their comments.

        For every match the cleaned JSDoc (when present) is emitted first,
        then the definition header starting at its keyword (modifiers such as
        ``export`` are not included), then an empty string as a separator.
        Headers longer than three lines are cut to three lines plus ``...``.

        :param code: The TypeScript source text.
        :return: The output lines for the definitions section.
        """
        definitions: list[str] = []

        for match in self._DEFINITION_RE.finditer(code):
            jsdoc = match.group(1)
            if jsdoc:
                formatted_jsdoc = self._clean_jsdoc(jsdoc)
                if formatted_jsdoc:
                    definitions.append(formatted_jsdoc)

            # match.start(2) is an offset into ``code`` (not into the matched
            # text), so the header has to be sliced from ``code`` itself.
            definition = code[match.start(2) : match.end()].strip()
            def_lines = definition.splitlines()
            if len(def_lines) > 3:  # If definition is long, abbreviate
                definitions.append("\n".join(def_lines[:3]) + "\n...")
            else:
                definitions.append("\n".join(def_lines))

            definitions.append("")  # Add empty line for readability

        return definitions