org_parser.py 2.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. # type: ignore
  2. from typing import AsyncGenerator
  3. import orgparse
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class ORGParser(AsyncParser[str | bytes]):
  11. """Parser for ORG (Emacs Org-mode) files."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. self.orgparse = orgparse
  22. def _process_node(self, node) -> list[str]:
  23. """Process an org-mode node and return its content."""
  24. contents = []
  25. # Add heading with proper level of asterisks
  26. if node.level > 0:
  27. contents.append(f"{'*' * node.level} {node.heading}")
  28. # Add body content if exists
  29. if node.body:
  30. contents.append(node.body.strip())
  31. return contents
  32. async def ingest(
  33. self, data: str | bytes, **kwargs
  34. ) -> AsyncGenerator[str, None]:
  35. """Ingest ORG data and yield document content."""
  36. if isinstance(data, bytes):
  37. data = data.decode("utf-8")
  38. try:
  39. # Create a temporary file-like object for orgparse
  40. from io import StringIO
  41. file_obj = StringIO(data)
  42. # Parse the org file
  43. root = self.orgparse.load(file_obj)
  44. # Process root node if it has content
  45. if root.body:
  46. yield root.body.strip()
  47. # Process all nodes
  48. for node in root[1:]: # Skip root node in iteration
  49. contents = self._process_node(node)
  50. for content in contents:
  51. if content.strip():
  52. yield content.strip()
  53. except Exception as e:
  54. raise ValueError(f"Error processing ORG file: {str(e)}") from e
  55. finally:
  56. file_obj.close()