test_document_processing.py 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. import pytest
  2. from unittest.mock import AsyncMock, MagicMock, patch, call
  3. from typing import Dict, List, Any, Optional
  4. # Skip all tests in this file for now as they need to be updated
  5. # to match the current Document and DocumentChunk implementations
  6. pytestmark = pytest.mark.skip("Document processing tests need to be updated to match current implementation")
  7. # Import necessary classes
  8. from core.base import Document, DocumentChunk
  9. @pytest.fixture
  10. def sample_document():
  11. """Return a sample document for testing."""
  12. return Document(
  13. document_id="doc-123",
  14. raw_text="Aristotle was a Greek philosopher who studied under Plato. He made significant contributions to logic, ethics, and metaphysics.",
  15. metadata={
  16. "source": "Philosophy Encyclopedia",
  17. "author": "Academic Press",
  18. "year": 2020,
  19. "document_type": "text"
  20. },
  21. chunks=[
  22. DocumentChunk(
  23. chunk_id="chunk-1",
  24. document_id="doc-123",
  25. text="Aristotle was a Greek philosopher who studied under Plato.",
  26. metadata={"section": "biography", "page": 1}
  27. ),
  28. DocumentChunk(
  29. chunk_id="chunk-2",
  30. document_id="doc-123",
  31. text="He made significant contributions to logic, ethics, and metaphysics.",
  32. metadata={"section": "contributions", "page": 1}
  33. )
  34. ]
  35. )
  36. @pytest.fixture
  37. def mock_document_handler():
  38. """Return a mock document handler."""
  39. handler = AsyncMock()
  40. handler.get_document_by_id = AsyncMock()
  41. handler.create_document = AsyncMock()
  42. handler.update_document = AsyncMock()
  43. handler.delete_document = AsyncMock()
  44. return handler
  45. @pytest.mark.asyncio
  46. async def test_document_chunking(mock_document_handler, sample_document):
  47. """Test document chunking functionality."""
  48. from core.main.services.documents import DocumentProcessingService
  49. # Setup the chunking service with mocked components
  50. service = DocumentProcessingService(document_handler=mock_document_handler)
  51. # Mock the chunking method
  52. original_chunk_method = service.chunk_document
  53. service.chunk_document = MagicMock(return_value=[
  54. DocumentChunk(
  55. chunk_id="new-chunk-1",
  56. document_id=sample_document.document_id,
  57. text="Aristotle was a Greek philosopher.",
  58. metadata={"auto_chunk": True}
  59. ),
  60. DocumentChunk(
  61. chunk_id="new-chunk-2",
  62. document_id=sample_document.document_id,
  63. text="He studied under Plato.",
  64. metadata={"auto_chunk": True}
  65. ),
  66. DocumentChunk(
  67. chunk_id="new-chunk-3",
  68. document_id=sample_document.document_id,
  69. text="He made significant contributions to logic, ethics, and metaphysics.",
  70. metadata={"auto_chunk": True}
  71. )
  72. ])
  73. # Process the document
  74. processed_doc = await service.process_document(sample_document)
  75. # Verify chunking was called
  76. service.chunk_document.assert_called_once()
  77. # Check that document was updated with new chunks
  78. assert len(processed_doc.chunks) == 3
  79. assert all(chunk.metadata.get("auto_chunk") for chunk in processed_doc.chunks)
  80. # Restore original method
  81. service.chunk_document = original_chunk_method
  82. @pytest.mark.asyncio
  83. async def test_document_metadata_extraction(mock_document_handler, sample_document):
  84. """Test metadata extraction from documents."""
  85. from core.main.services.documents import DocumentProcessingService
  86. # Setup the document processing service
  87. service = DocumentProcessingService(document_handler=mock_document_handler)
  88. # Mock metadata extraction
  89. original_extract_method = service.extract_metadata
  90. service.extract_metadata = MagicMock(return_value={
  91. "title": "Aristotle: Life and Works",
  92. "topics": ["philosophy", "logic", "ethics"],
  93. "sentiment": "neutral",
  94. "word_count": 24
  95. })
  96. # Process the document
  97. processed_doc = await service.process_document(sample_document, extract_metadata=True)
  98. # Verify metadata extraction was called
  99. service.extract_metadata.assert_called_once_with(sample_document.raw_text)
  100. # Check that document metadata was updated
  101. for key, value in service.extract_metadata.return_value.items():
  102. assert processed_doc.metadata.get(key) == value
  103. # Restore original method
  104. service.extract_metadata = original_extract_method
  105. @pytest.mark.asyncio
  106. async def test_document_embedding_generation(mock_document_handler, sample_document):
  107. """Test embedding generation for document chunks."""
  108. from core.main.services.documents import DocumentProcessingService
  109. # Setup mock embedding provider
  110. mock_embedding_provider = AsyncMock()
  111. mock_embedding_provider.async_get_embedding = AsyncMock(
  112. return_value=[0.1, 0.2, 0.3, 0.4]
  113. )
  114. # Setup document processing service
  115. service = DocumentProcessingService(
  116. document_handler=mock_document_handler,
  117. embedding_provider=mock_embedding_provider
  118. )
  119. # Process document with embedding generation
  120. processed_doc = await service.process_document(
  121. sample_document,
  122. generate_embeddings=True
  123. )
  124. # Verify embedding provider was called for each chunk
  125. assert mock_embedding_provider.async_get_embedding.call_count == len(sample_document.chunks)
  126. # Check that embeddings were stored with chunks
  127. for chunk in processed_doc.chunks:
  128. assert hasattr(chunk, "embedding")
  129. assert chunk.embedding == [0.1, 0.2, 0.3, 0.4]
  130. @pytest.mark.asyncio
  131. async def test_document_citation_processing(mock_document_handler, sample_document):
  132. """Test citation extraction and processing in documents."""
  133. from core.main.services.documents import DocumentProcessingService
  134. # Add citation markers to document text
  135. document_with_citations = Document(
  136. document_id="doc-456",
  137. raw_text="According to Smith [abc123], Aristotle developed formal logic. Jones [def456] argues that his ethics were revolutionary.",
  138. metadata={"source": "Academic Journal"}
  139. )
  140. # Setup document processing service
  141. service = DocumentProcessingService(document_handler=mock_document_handler)
  142. # Mock citation extraction method
  143. original_extract_citations = service.extract_citations
  144. service.extract_citations = MagicMock(return_value=[
  145. {"id": "abc123", "span": "According to Smith [abc123]", "start": 0, "end": 25},
  146. {"id": "def456", "span": "Jones [def456]", "start": 54, "end": 68}
  147. ])
  148. # Process document with citation extraction
  149. processed_doc = await service.process_document(
  150. document_with_citations,
  151. extract_citations=True
  152. )
  153. # Verify citation extraction was called
  154. service.extract_citations.assert_called_once_with(document_with_citations.raw_text)
  155. # Check that citations were stored with the document
  156. assert "citations" in processed_doc.metadata
  157. assert len(processed_doc.metadata["citations"]) == 2
  158. assert processed_doc.metadata["citations"][0]["id"] == "abc123"
  159. assert processed_doc.metadata["citations"][1]["id"] == "def456"
  160. # Restore original method
  161. service.extract_citations = original_extract_citations
  162. @pytest.mark.asyncio
  163. async def test_document_text_preprocessing(mock_document_handler):
  164. """Test text preprocessing for documents."""
  165. from core.main.services.documents import DocumentProcessingService
  166. # Setup document with formatting issues
  167. document_with_formatting = Document(
  168. document_id="doc-789",
  169. raw_text=" Aristotle was\n\na Greek\tphilosopher. He studied\nunder Plato. ",
  170. metadata={}
  171. )
  172. # Setup document processing service
  173. service = DocumentProcessingService(document_handler=mock_document_handler)
  174. # Mock text preprocessing method
  175. original_preprocess = service.preprocess_text
  176. service.preprocess_text = MagicMock(return_value="Aristotle was a Greek philosopher. He studied under Plato.")
  177. # Process document with preprocessing
  178. processed_doc = await service.process_document(
  179. document_with_formatting,
  180. preprocess_text=True
  181. )
  182. # Verify preprocessing was called
  183. service.preprocess_text.assert_called_once_with(document_with_formatting.raw_text)
  184. # Check that document text was preprocessed
  185. assert processed_doc.raw_text == "Aristotle was a Greek philosopher. He studied under Plato."
  186. # Restore original method
  187. service.preprocess_text = original_preprocess