msg_parser.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475
  1. # type: ignore
  2. from typing import AsyncGenerator
  3. from core.base.parsers.base_parser import AsyncParser
  4. from core.base.providers import (
  5. CompletionProvider,
  6. DatabaseProvider,
  7. IngestionConfig,
  8. )
  9. class MSGParser(AsyncParser[str | bytes]):
  10. """Parser for MSG (Outlook Message) files."""
  11. def __init__(
  12. self,
  13. config: IngestionConfig,
  14. database_provider: DatabaseProvider,
  15. llm_provider: CompletionProvider,
  16. ):
  17. self.database_provider = database_provider
  18. self.llm_provider = llm_provider
  19. self.config = config
  20. try:
  21. import extract_msg
  22. self.extract_msg = extract_msg
  23. except ImportError:
  24. raise ImportError(
  25. "Error: 'extract-msg' is required to run MSGParser. "
  26. "Please install it using pip: pip install extract-msg"
  27. )
  28. async def ingest(
  29. self, data: str | bytes, **kwargs
  30. ) -> AsyncGenerator[str, None]:
  31. """Ingest MSG data and yield email content."""
  32. if isinstance(data, str):
  33. raise ValueError("MSG data must be in bytes format.")
  34. from io import BytesIO
  35. file_obj = BytesIO(data)
  36. try:
  37. msg = self.extract_msg.Message(file_obj)
  38. # Extract metadata
  39. metadata = []
  40. if msg.subject:
  41. metadata.append(f"Subject: {msg.subject}")
  42. if msg.sender:
  43. metadata.append(f"From: {msg.sender}")
  44. if msg.to:
  45. metadata.append(f"To: {msg.to}")
  46. if msg.date:
  47. metadata.append(f"Date: {msg.date}")
  48. if metadata:
  49. yield "\n".join(metadata)
  50. # Extract body
  51. if msg.body:
  52. yield msg.body.strip()
  53. # Extract attachments (optional)
  54. for attachment in msg.attachments:
  55. if hasattr(attachment, "name"):
  56. yield f"\nAttachment: {attachment.name}"
  57. except Exception as e:
  58. raise ValueError(f"Error processing MSG file: {str(e)}")
  59. finally:
  60. file_obj.close()