# epub_parser.py (4.1 KB)
# type: ignore
import html
import logging
import re
from io import BytesIO
from typing import AsyncGenerator

import epub

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
    CompletionProvider,
    DatabaseProvider,
    IngestionConfig,
)

logger = logging.getLogger(__name__)
  12. class EPUBParser(AsyncParser[str | bytes]):
  13. """Parser for EPUB electronic book files."""
  14. def __init__(
  15. self,
  16. config: IngestionConfig,
  17. database_provider: DatabaseProvider,
  18. llm_provider: CompletionProvider,
  19. ):
  20. self.database_provider = database_provider
  21. self.llm_provider = llm_provider
  22. self.config = config
  23. self.epub = epub
  24. def _safe_get_metadata(self, book, field: str) -> str | None:
  25. """Safely extract metadata field from epub book."""
  26. try:
  27. return getattr(book, field, None) or getattr(book.opf, field, None)
  28. except Exception as e:
  29. logger.debug(f"Error getting {field} metadata: {e}")
  30. return None
  31. def _clean_text(self, content: bytes) -> str:
  32. """Clean HTML content and return plain text."""
  33. try:
  34. import re
  35. text = content.decode("utf-8", errors="ignore")
  36. # Remove HTML tags
  37. text = re.sub(r"<[^>]+>", " ", text)
  38. # Normalize whitespace
  39. text = re.sub(r"\s+", " ", text)
  40. # Remove any remaining HTML entities
  41. text = re.sub(r"&[^;]+;", " ", text)
  42. return text.strip()
  43. except Exception as e:
  44. logger.warning(f"Error cleaning text: {e}")
  45. return ""
  46. async def ingest(
  47. self, data: str | bytes, **kwargs
  48. ) -> AsyncGenerator[str, None]:
  49. """Ingest EPUB data and yield book content."""
  50. if isinstance(data, str):
  51. raise ValueError("EPUB data must be in bytes format.")
  52. from io import BytesIO
  53. file_obj = BytesIO(data)
  54. try:
  55. book = self.epub.open_epub(file_obj)
  56. # Safely extract metadata
  57. metadata = []
  58. for field, label in [
  59. ("title", "Title"),
  60. ("creator", "Author"),
  61. ("language", "Language"),
  62. ("publisher", "Publisher"),
  63. ("date", "Date"),
  64. ]:
  65. if value := self._safe_get_metadata(book, field):
  66. metadata.append(f"{label}: {value}")
  67. if metadata:
  68. yield "\n".join(metadata)
  69. # Extract content from items
  70. try:
  71. manifest = getattr(book.opf, "manifest", {}) or {}
  72. for item in manifest.values():
  73. try:
  74. if (
  75. getattr(item, "mime_type", "")
  76. == "application/xhtml+xml"
  77. ):
  78. if content := book.read_item(item):
  79. if cleaned_text := self._clean_text(content):
  80. yield cleaned_text
  81. except Exception as e:
  82. logger.warning(f"Error processing item: {e}")
  83. continue
  84. except Exception as e:
  85. logger.warning(f"Error accessing manifest: {e}")
  86. # Fallback: try to get content directly
  87. if hasattr(book, "read_item"):
  88. for item_id in getattr(book, "items", []):
  89. try:
  90. if content := book.read_item(item_id):
  91. if cleaned_text := self._clean_text(content):
  92. yield cleaned_text
  93. except Exception as e:
  94. logger.warning(f"Error in fallback reading: {e}")
  95. continue
  96. except Exception as e:
  97. logger.error(f"Error processing EPUB file: {str(e)}")
  98. raise ValueError(f"Error processing EPUB file: {str(e)}") from e
  99. finally:
  100. try:
  101. file_obj.close()
  102. except Exception as e:
  103. logger.warning(f"Error closing file: {e}")