# jack/r2rpy
							# type: ignore
import re
from typing import AsyncGenerator

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)


class JSParser(AsyncParser[str | bytes]):
    """A parser for JavaScript files.

    The parser is purely regex-based (it does not build a JavaScript syntax
    tree). It produces a plain-text outline with up to five sections:
    comments, imports/exports, functions, classes and variable declarations.

    Known limitations of the heuristic approach: regular-expression literals
    and nested template literals are not tokenised, parameter lists that
    contain ``)`` are not matched, and variable declarations are only
    recognised when they end with ``;`` on the line where their initialiser
    starts.
    """

    # Single left-to-right scan over string literals and comments. Strings are
    # matched only so that "//" or "/*" inside them (e.g. "http://host") is
    # never mistaken for a comment; only the two named groups are comments.
    _TOKEN_RE = re.compile(
        r'"(?:\\.|[^"\\\n])*"'  # double-quoted string
        r"|'(?:\\.|[^'\\\n])*'"  # single-quoted string
        r"|`(?:\\.|[^`\\])*`"  # template literal (may span lines)
        r"|/\*(?P<block>.*?)\*/"  # block comment, inner text captured
        r"|//(?P<line>[^\n]*)",  # line comment, text after "//" captured
        re.DOTALL,
    )

    # Static import: optional bindings (brace lists may span lines), then the
    # quoted module specifier, then anything up to an optional ";".
    _IMPORT_RE = re.compile(
        r"\bimport\s+(?:[^;\n{'\"]|\{[^{}]*\})*"
        r"(['\"])[^'\"\n]*\1[^;\n]*;?"
    )

    # Export: for function/class declarations only the header (text before
    # the body's "{") is kept; otherwise the statement up to an optional ";"
    # or end of line, where a leading "{ ... }" list may span lines.
    _EXPORT_RE = re.compile(
        r"\bexport\s+(?:"
        r"(?:default\s+)?(?:async\s+)?(?:function\b|class\b)[^{\n]*"
        r"|(?:\{[^{}]*\}[^;\n]*|[^;\n]+);?"
        r")"
    )

    # Named function declaration (optionally async and/or generator). The
    # lookahead requires a body but does not try to match it, so functions
    # containing nested blocks are found too.
    _FUNCTION_RE = re.compile(
        r"\b(?:async\s+)?function(?:\s*\*\s*|\s+)([\w$]+)"
        r"\s*\([^)]*\)(?=\s*\{)"
    )

    # Arrow function bound to a name: parameters are either parenthesised or
    # a single identifier.
    _ARROW_RE = re.compile(
        r"\b(?:const|let|var)\s+([\w$]+)\s*=\s*(?:async\s+)?"
        r"(?:\([^)]*\)|[\w$]+)\s*=>\s*\{?"
    )

    # "name(args) {" -- methods in classes/object literals (and, as a side
    # effect, anonymous "function (...) {" expressions).
    _METHOD_RE = re.compile(r"(?<![\w$])([\w$]+)\s*\([^)]*\)\s*\{")

    # Keywords that look like "name(...) {" but are control-flow statements.
    _CONTROL_KEYWORDS = frozenset(
        {"if", "for", "while", "switch", "catch", "with"}
    )

    # Class declaration; the superclass may be a dotted name such as
    # React.Component.
    _CLASS_RE = re.compile(
        r"\bclass\s+([\w$]+)(?:\s+extends\s+([\w$.]+))?\s*\{"
    )

    # Class expression assigned to a variable; "body" starts at the `class`
    # keyword so declarations found at the same offset can be de-duplicated.
    _CLASS_EXPR_RE = re.compile(
        r"\b(?:const|let|var)\s+([\w$]+)\s*=\s*"
        r"(?P<body>class(?:\s+(?!extends\b)[\w$]+)?"
        r"(?:\s+extends\s+[\w$.]+)?\s*\{)"
    )

    # Variable declaration terminated by ";": a plain name or an object/array
    # destructuring target, with an optional initialiser.
    _VARIABLE_RE = re.compile(
        r"\b(?:const|let|var)\s+"
        r"(?:[\w$]+|\{[^{}]*\}|\[[^\[\]]*\])"
        r"(?:\s*=\s*[^;\n]*)?\s*;"
    )

    # Declaration whose initialiser is a function or class expression.
    _CALLABLE_INIT_RE = re.compile(
        r"(?:const|let|var)\s+[\w$]+\s*=\s*"
        r"(?:(?:async\s+)?function\b|class\b)"
    )

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """Store the configuration and providers on the instance.

        :param config: Ingestion configuration object.
        :param database_provider: Database provider handle.
        :param llm_provider: Completion (LLM) provider handle.
        """
        self.database_provider = database_provider
        self.llm_provider = llm_provider
        self.config = config

    async def ingest(
        self, data: str | bytes, *args, **kwargs
    ) -> AsyncGenerator[str, None]:
        """Ingest JavaScript data and yield structured text representation.

        Extracts functions, classes, variable declarations, comments, and
        other important structures from JavaScript code in a text format
        suitable for analysis. Exactly one string is yielded.

        :param data: The JavaScript content to parse. ``bytes`` input is
            decoded as UTF-8 and undecodable bytes are dropped.
        :param args: Additional positional arguments (unused).
        :param kwargs: Additional keyword arguments (unused).
        """
        if isinstance(data, bytes):
            data = data.decode("utf-8", errors="ignore")

        yield self._process_js_content(data)

    def _process_js_content(self, js: str) -> str:
        """Process JavaScript content into a structured text representation.

        Each non-empty category is emitted as a heading line, followed by its
        entries (one list item each) and a blank line. Categories appear in
        this order: comments, imports/exports, functions, classes, variable
        declarations. Empty categories are omitted entirely.

        :param js: JavaScript source text.
        :return: The sections joined with newlines ("" if nothing was found).
        """
        sections = (
            ("COMMENTS:", self._extract_comments(js)),
            ("IMPORTS AND EXPORTS:", self._extract_imports_exports(js)),
            ("FUNCTIONS:", self._extract_functions(js)),
            ("CLASSES:", self._extract_classes(js)),
            ("VARIABLE DECLARATIONS:", self._extract_variables(js)),
        )

        lines: list[str] = []
        for heading, entries in sections:
            if entries:
                lines.append(heading)
                lines.extend(entries)
                lines.append("")

        return "\n".join(lines)

    def _extract_comments(self, js: str) -> list[str]:
        """Extract comments from JavaScript content.

        Comment markers that occur inside string or template literals are
        ignored, and text inside a block comment is never reported a second
        time as a line comment.

        :param js: JavaScript source text.
        :return: Stripped, non-empty comment texts without their delimiters;
            all block comments first, then all line comments, each group in
            source order.
        """
        block_comments: list[str] = []
        line_comments: list[str] = []

        for token in self._TOKEN_RE.finditer(js):
            # lastgroup is None for string literals (their alternatives have
            # no named group), so those are skipped here.
            if token.lastgroup == "block":
                block_comments.append(token.group("block").strip())
            elif token.lastgroup == "line":
                line_comments.append(token.group("line").strip())

        return [text for text in block_comments + line_comments if text]

    def _extract_imports_exports(self, js: str) -> list[str]:
        """Extract import and export statements.

        Brace lists may span several lines and the trailing semicolon is
        optional. For exported function/class declarations only the header
        is returned, never part of the body.

        :param js: JavaScript source text.
        :return: Stripped statements; all imports first, then all exports.
        """
        code = self._remove_comments(js)

        statements = [m.group(0) for m in self._IMPORT_RE.finditer(code)]
        statements += [m.group(0) for m in self._EXPORT_RE.finditer(code)]

        return [stmt.strip() for stmt in statements]

    def _extract_functions(self, js: str) -> list[str]:
        """Extract function definitions.

        :param js: JavaScript source text.
        :return: Signatures in three groups: named ``function`` declarations
            (``function name(args)``), arrow functions assigned to a
            variable, then method-style ``name(args) {`` definitions. A
            declaration already reported in the first group is not repeated
            in the third, and control-flow statements are excluded.
        """
        code = self._remove_comments(js)
        results: list[str] = []

        # Offsets of declared function names, used below to avoid reporting
        # "function foo(a)" again as the method-style match "foo(a) {".
        declared_at: set[int] = set()
        for decl in self._FUNCTION_RE.finditer(code):
            declared_at.add(decl.start(1))
            results.append(decl.group(0))

        results.extend(
            arrow.group(0).strip() for arrow in self._ARROW_RE.finditer(code)
        )

        for method in self._METHOD_RE.finditer(code):
            if method.group(1) in self._CONTROL_KEYWORDS:
                continue
            if method.start(1) in declared_at:
                continue
            results.append(method.group(0))

        return results

    def _extract_classes(self, js: str) -> list[str]:
        """Extract class definitions.

        :param js: JavaScript source text.
        :return: Class headers up to and including the opening ``{``: class
            declarations first, then class expressions assigned to a
            variable. A named class expression is reported once, as an
            expression.
        """
        code = self._remove_comments(js)

        expressions = list(self._CLASS_EXPR_RE.finditer(code))
        # Offsets of the `class` keyword inside class expressions; a
        # declaration match starting there is the same class.
        expression_starts = {expr.start("body") for expr in expressions}

        results = [
            decl.group(0)
            for decl in self._CLASS_RE.finditer(code)
            if decl.start() not in expression_starts
        ]
        results.extend(expr.group(0) for expr in expressions)

        return results

    def _extract_variables(self, js: str) -> list[str]:
        """Extract variable declarations.

        Only ``const``/``let``/``var`` declarations terminated by ``;`` are
        recognised; the target may be a plain name or a destructuring
        pattern. Declarations whose initialiser is itself a function, arrow
        function or class expression are skipped because the function and
        class extractors report them.

        :param js: JavaScript source text.
        :return: Stripped declaration statements in source order.
        """
        code = self._remove_comments(js)

        results: list[str] = []
        for declaration in self._VARIABLE_RE.finditer(code):
            text = declaration.group(0).strip()
            if self._CALLABLE_INIT_RE.match(text) or self._ARROW_RE.match(
                text
            ):
                continue
            results.append(text)

        return results

    def _remove_comments(self, js: str) -> str:
        """Remove comments from JavaScript code to simplify parsing.

        String and template literals are copied through unchanged, so
        comment markers inside them (for example ``"http://host"``) do not
        truncate the code.

        :param js: JavaScript source text.
        :return: The source with block and line comments deleted.
        """
        return self._TOKEN_RE.sub(
            # Comments carry a named group; string literals do not.
            lambda token: "" if token.lastgroup else token.group(0),
            js,
        )