# css_parser.py (3.5 KB)
  1. # type: ignore
  2. import re
  3. from typing import AsyncGenerator
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class CSSParser(AsyncParser[str | bytes]):
  11. """A parser for CSS files."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. async def ingest(
  22. self, data: str | bytes, *args, **kwargs
  23. ) -> AsyncGenerator[str, None]:
  24. """Ingest CSS data and yield structured text representation.
  25. Extracts selectors, properties, values, and comments from CSS while
  26. preserving the structure in a text format suitable for analysis.
  27. :param data: The CSS content to parse
  28. :param kwargs: Additional keyword arguments
  29. """
  30. if isinstance(data, bytes):
  31. data = data.decode("utf-8", errors="ignore")
  32. # Process the CSS content
  33. processed_text = self._process_css_content(data)
  34. # Yield the processed text
  35. yield processed_text
  36. def _process_css_content(self, css: str) -> str:
  37. """Process CSS content into a structured text representation.
  38. This method:
  39. 1. Extracts and preserves comments
  40. 2. Identifies selectors and their properties
  41. 3. Formats the CSS structure in a readable way
  42. """
  43. # Extract comments
  44. comments = self._extract_comments(css)
  45. # Extract rules (selectors and declarations)
  46. rules = self._extract_rules(css)
  47. # Build the result
  48. result = []
  49. if comments:
  50. result.append("COMMENTS:")
  51. result.extend(comments)
  52. result.append("")
  53. if rules:
  54. result.append("CSS RULES:")
  55. result.extend(rules)
  56. return "\n".join(result)
  57. def _extract_comments(self, css: str) -> list[str]:
  58. """Extract comments from CSS content."""
  59. comment_pattern = r"/\*(.*?)\*/"
  60. comments = re.findall(comment_pattern, css, re.DOTALL)
  61. return [comment.strip() for comment in comments if comment.strip()]
  62. def _extract_rules(self, css: str) -> list[str]:
  63. """Extract selectors and their declarations from CSS content."""
  64. # Remove comments to simplify parsing
  65. css_without_comments = re.sub(r"/\*.*?\*/", "", css, flags=re.DOTALL)
  66. # Pattern to match CSS rules
  67. rule_pattern = r"([^{]+)\{([^}]*)\}"
  68. matches = re.findall(rule_pattern, css_without_comments)
  69. rules = []
  70. for selector, declarations in matches:
  71. selector = selector.strip()
  72. if not selector:
  73. continue
  74. rules.append(f"Selector: {selector}")
  75. # Process declarations
  76. declaration_list = declarations.strip().split(";")
  77. for declaration in declaration_list:
  78. declaration = declaration.strip()
  79. if declaration:
  80. property_value = declaration.split(":", 1)
  81. if len(property_value) == 2:
  82. property_name = property_value[0].strip()
  83. value = property_value[1].strip()
  84. rules.append(f" {property_name}: {value}")
  85. rules.append("") # Empty line for readability
  86. return rules