123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
- import pytest
- from unittest.mock import AsyncMock, MagicMock, patch, call
- from typing import Dict, List, Any, Optional
- # Skip all tests in this file for now as they need to be updated
- # to match the current Document and DocumentChunk implementations
- pytestmark = pytest.mark.skip("Document processing tests need to be updated to match current implementation")
- # Import necessary classes
- from core.base import Document, DocumentChunk
@pytest.fixture
def sample_document():
    """Build the two-chunk Aristotle document shared by the tests.

    Returns:
        A ``Document`` with id ``"doc-123"``, encyclopedia-style metadata and
        two ``DocumentChunk`` objects that both reference that id.
    """
    doc_id = "doc-123"

    # Both chunks are tagged as page 1; only the section label differs.
    biography_chunk = DocumentChunk(
        chunk_id="chunk-1",
        document_id=doc_id,
        text="Aristotle was a Greek philosopher who studied under Plato.",
        metadata={"section": "biography", "page": 1},
    )
    contributions_chunk = DocumentChunk(
        chunk_id="chunk-2",
        document_id=doc_id,
        text="He made significant contributions to logic, ethics, and metaphysics.",
        metadata={"section": "contributions", "page": 1},
    )

    document_metadata = {
        "source": "Philosophy Encyclopedia",
        "author": "Academic Press",
        "year": 2020,
        "document_type": "text",
    }

    return Document(
        document_id=doc_id,
        raw_text=(
            "Aristotle was a Greek philosopher who studied under Plato. "
            "He made significant contributions to logic, ethics, and metaphysics."
        ),
        metadata=document_metadata,
        chunks=[biography_chunk, contributions_chunk],
    )
@pytest.fixture
def mock_document_handler():
    """Provide an ``AsyncMock`` standing in for the document handler.

    The get/create/update/delete document methods are each assigned their own
    explicit ``AsyncMock`` so tests can await them and inspect the calls.
    """
    handler_methods = (
        "get_document_by_id",
        "create_document",
        "update_document",
        "delete_document",
    )
    mock_handler = AsyncMock()
    for method_name in handler_methods:
        setattr(mock_handler, method_name, AsyncMock())
    return mock_handler
@pytest.mark.asyncio
async def test_document_chunking(mock_document_handler, sample_document):
    """Check that ``process_document`` replaces the chunks with the chunker's output.

    ``chunk_document`` is swapped for a ``MagicMock`` returning three chunks
    flagged ``auto_chunk``; the processed document must expose exactly those.
    """
    from core.main.services.documents import DocumentProcessingService

    service = DocumentProcessingService(document_handler=mock_document_handler)

    chunk_texts = (
        "Aristotle was a Greek philosopher.",
        "He studied under Plato.",
        "He made significant contributions to logic, ethics, and metaphysics.",
    )
    replacement_chunks = [
        DocumentChunk(
            chunk_id=f"new-chunk-{position}",
            document_id=sample_document.document_id,
            text=text,
            metadata={"auto_chunk": True},
        )
        for position, text in enumerate(chunk_texts, start=1)
    ]
    chunker = MagicMock(return_value=replacement_chunks)

    # patch.object restores the real method even if process_document raises,
    # which a manual save/assign/restore sequence would not.
    with patch.object(service, "chunk_document", chunker):
        processed_doc = await service.process_document(sample_document)

    # The chunker must have been invoked exactly once.
    chunker.assert_called_once()

    # The document must now carry the three auto-generated chunks.
    assert len(processed_doc.chunks) == 3
    assert all(chunk.metadata.get("auto_chunk") for chunk in processed_doc.chunks)
@pytest.mark.asyncio
async def test_document_metadata_extraction(mock_document_handler, sample_document):
    """Check that extracted metadata is merged into the processed document.

    ``extract_metadata`` is swapped for a ``MagicMock`` returning a fixed dict;
    every key/value of that dict must appear in the processed document's
    metadata, and the extractor must have been called with the raw text.
    """
    from core.main.services.documents import DocumentProcessingService

    service = DocumentProcessingService(document_handler=mock_document_handler)

    extracted = {
        "title": "Aristotle: Life and Works",
        "topics": ["philosophy", "logic", "ethics"],
        "sentiment": "neutral",
        "word_count": 24,
    }
    extractor = MagicMock(return_value=extracted)

    # patch.object restores the real method even if process_document raises,
    # which a manual save/assign/restore sequence would not.
    with patch.object(service, "extract_metadata", extractor):
        processed_doc = await service.process_document(
            sample_document, extract_metadata=True
        )

    # The extractor must have received the document's raw text, exactly once.
    extractor.assert_called_once_with(sample_document.raw_text)

    # Each extracted field must be present on the processed document.
    for key, value in extracted.items():
        assert processed_doc.metadata.get(key) == value
@pytest.mark.asyncio
async def test_document_embedding_generation(mock_document_handler, sample_document):
    """Check that each chunk receives an embedding from the provider.

    The embedding provider is an ``AsyncMock`` whose ``async_get_embedding``
    always yields the same four-element vector.
    """
    from core.main.services.documents import DocumentProcessingService

    expected_vector = [0.1, 0.2, 0.3, 0.4]

    # The mock gets its own copy so the comparison below is made against a
    # list the service never had a reference to.
    embedder = AsyncMock()
    embedder.async_get_embedding = AsyncMock(return_value=list(expected_vector))

    service = DocumentProcessingService(
        document_handler=mock_document_handler,
        embedding_provider=embedder,
    )

    processed_doc = await service.process_document(
        sample_document, generate_embeddings=True
    )

    # One provider call per chunk of the input document.
    chunk_count = len(sample_document.chunks)
    assert embedder.async_get_embedding.call_count == chunk_count

    # Every processed chunk must carry the mocked vector.
    for processed_chunk in processed_doc.chunks:
        assert hasattr(processed_chunk, "embedding")
        assert processed_chunk.embedding == expected_vector
@pytest.mark.asyncio
async def test_document_citation_processing(mock_document_handler, sample_document):
    """Check that extracted citations are stored in the document metadata.

    ``extract_citations`` is swapped for a ``MagicMock`` returning two citation
    records. Their ``start``/``end`` offsets (end exclusive) are derived from
    the document text so the mocked data always agrees with ``raw_text``.
    """
    from core.main.services.documents import DocumentProcessingService

    # Document text containing two bracketed citation markers.
    document_with_citations = Document(
        document_id="doc-456",
        raw_text="According to Smith [abc123], Aristotle developed formal logic. Jones [def456] argues that his ethics were revolutionary.",
        metadata={"source": "Academic Journal"},
    )
    raw_text = document_with_citations.raw_text

    service = DocumentProcessingService(document_handler=mock_document_handler)

    # Locate each span in the text instead of hard-coding offsets, which had
    # drifted out of sync with the sentence they describe.
    cited_spans = (
        ("abc123", "According to Smith [abc123]"),
        ("def456", "Jones [def456]"),
    )
    mocked_citations = []
    for citation_id, span in cited_spans:
        start = raw_text.index(span)
        mocked_citations.append(
            {"id": citation_id, "span": span, "start": start, "end": start + len(span)}
        )
    citation_extractor = MagicMock(return_value=mocked_citations)

    # patch.object restores the real method even if process_document raises,
    # which a manual save/assign/restore sequence would not.
    with patch.object(service, "extract_citations", citation_extractor):
        processed_doc = await service.process_document(
            document_with_citations, extract_citations=True
        )

    # The extractor must have received the document's raw text, exactly once.
    citation_extractor.assert_called_once_with(raw_text)

    # Both citations must be stored, in order, under the "citations" key.
    assert "citations" in processed_doc.metadata
    assert len(processed_doc.metadata["citations"]) == 2
    assert processed_doc.metadata["citations"][0]["id"] == "abc123"
    assert processed_doc.metadata["citations"][1]["id"] == "def456"
@pytest.mark.asyncio
async def test_document_text_preprocessing(mock_document_handler):
    """Check that preprocessed text replaces the document's raw text.

    ``preprocess_text`` is swapped for a ``MagicMock`` returning a cleaned-up
    sentence; the processed document's ``raw_text`` must equal that sentence,
    and the preprocessor must have been called with the original text.
    """
    from core.main.services.documents import DocumentProcessingService

    # Input with stray leading/trailing whitespace, newlines and a tab.
    document_with_formatting = Document(
        document_id="doc-789",
        raw_text=" Aristotle was\n\na Greek\tphilosopher. He studied\nunder Plato. ",
        metadata={},
    )

    service = DocumentProcessingService(document_handler=mock_document_handler)

    cleaned_text = "Aristotle was a Greek philosopher. He studied under Plato."
    preprocessor = MagicMock(return_value=cleaned_text)

    # patch.object restores the real method even if process_document raises,
    # which a manual save/assign/restore sequence would not.
    with patch.object(service, "preprocess_text", preprocessor):
        processed_doc = await service.process_document(
            document_with_formatting, preprocess_text=True
        )

    # The preprocessor must have received the original text, exactly once.
    preprocessor.assert_called_once_with(document_with_formatting.raw_text)

    # The document must now hold the cleaned text.
    assert processed_doc.raw_text == cleaned_text
|