eml_parser.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263
  1. # type: ignore
  2. from email import message_from_bytes, policy
  3. from typing import AsyncGenerator
  4. from core.base.parsers.base_parser import AsyncParser
  5. from core.base.providers import (
  6. CompletionProvider,
  7. DatabaseProvider,
  8. IngestionConfig,
  9. )
  10. class EMLParser(AsyncParser[str | bytes]):
  11. """Parser for EML (email) files."""
  12. def __init__(
  13. self,
  14. config: IngestionConfig,
  15. database_provider: DatabaseProvider,
  16. llm_provider: CompletionProvider,
  17. ):
  18. self.database_provider = database_provider
  19. self.llm_provider = llm_provider
  20. self.config = config
  21. async def ingest(
  22. self, data: str | bytes, **kwargs
  23. ) -> AsyncGenerator[str, None]:
  24. """Ingest EML data and yield email content."""
  25. if isinstance(data, str):
  26. raise ValueError("EML data must be in bytes format.")
  27. # Parse email with policy for modern email handling
  28. email_message = message_from_bytes(data, policy=policy.default)
  29. # Extract and yield email metadata
  30. metadata = []
  31. if email_message["Subject"]:
  32. metadata.append(f"Subject: {email_message['Subject']}")
  33. if email_message["From"]:
  34. metadata.append(f"From: {email_message['From']}")
  35. if email_message["To"]:
  36. metadata.append(f"To: {email_message['To']}")
  37. if email_message["Date"]:
  38. metadata.append(f"Date: {email_message['Date']}")
  39. if metadata:
  40. yield "\n".join(metadata)
  41. # Extract and yield email body
  42. if email_message.is_multipart():
  43. for part in email_message.walk():
  44. if part.get_content_type() == "text/plain":
  45. text = part.get_content()
  46. if text.strip():
  47. yield text.strip()
  48. elif part.get_content_type() == "text/html":
  49. # Could add HTML parsing here if needed
  50. continue
  51. else:
  52. body = email_message.get_content()
  53. if body.strip():
  54. yield body.strip()