# nextcloud-mcp-server/tests/unit/test_document_chunker.py

"""Unit tests for DocumentChunker with position tracking."""

from nextcloud_mcp_server.vector.document_chunker import (
    ChunkWithPosition,
    DocumentChunker,
)


class TestDocumentChunkerPositions:
    """Test suite for DocumentChunker position tracking functionality.

    Each test calls ``DocumentChunker.chunk_text`` and checks the ``text``,
    ``start_offset`` and ``end_offset`` attributes of the returned
    ``ChunkWithPosition`` objects against the original content string.
    """

    @staticmethod
    def _assert_offsets_match_text(content, chunks):
        """Assert that slicing ``content`` by each chunk's offsets yields its text.

        Args:
            content: The original string that was passed to ``chunk_text``.
            chunks: The chunks returned for ``content``.
        """
        for chunk in chunks:
            extracted = content[chunk.start_offset : chunk.end_offset]
            assert extracted == chunk.text

    def test_single_chunk_simple_text(self):
        """Test that single-chunk documents return correct positions."""
        chunker = DocumentChunker(chunk_size=512, overlap=50)
        content = "This is a short document."

        chunks = chunker.chunk_text(content)

        assert len(chunks) == 1
        only_chunk = chunks[0]
        assert isinstance(only_chunk, ChunkWithPosition)
        # A lone chunk must span the whole document unchanged
        assert only_chunk.text == content
        assert only_chunk.start_offset == 0
        assert only_chunk.end_offset == len(content)

    def test_multiple_chunks_positions(self):
        """Test that multi-chunk documents have correct positions."""
        chunker = DocumentChunker(chunk_size=10, overlap=2)  # Small chunks for testing
        # Create content with exactly 30 words
        content = " ".join(f"word{i:02d}" for i in range(30))

        chunks = chunker.chunk_text(content)

        # Verify we got multiple chunks (30 words, 10 per chunk, 2 overlap = 4 chunks)
        assert len(chunks) == 4

        # Verify all chunks are ChunkWithPosition
        for chunk in chunks:
            assert isinstance(chunk, ChunkWithPosition)

        # Verify the chunks cover the document from its first to its last character
        assert chunks[0].start_offset == 0
        assert chunks[-1].end_offset == len(content)

        # Verify chunks are contiguous or overlap (no gaps): each chunk should
        # start at or before the point where its predecessor ends
        for current_chunk, next_chunk in zip(chunks, chunks[1:]):
            assert next_chunk.start_offset <= current_chunk.end_offset

        # Verify we can reconstruct each chunk's text using its positions
        self._assert_offsets_match_text(content, chunks)

    def test_chunk_positions_with_whitespace(self):
        """Test position tracking with various whitespace."""
        chunker = DocumentChunker(chunk_size=5, overlap=1)
        content = "word1  word2\n\nword3\tword4    word5 word6"

        chunks = chunker.chunk_text(content)

        # Verify positions correctly handle runs of spaces, newlines and tabs
        self._assert_offsets_match_text(content, chunks)

        # Middle chunks (selected by position rather than by value equality,
        # so a chunk that happens to compare equal to an end chunk is not
        # skipped) must contain more than just whitespace
        for chunk in chunks[1:-1]:
            assert chunk.text.strip() != ""

    def test_empty_content(self):
        """Test that empty content returns empty chunk."""
        chunker = DocumentChunker(chunk_size=512, overlap=50)
        content = ""

        chunks = chunker.chunk_text(content)

        # Empty input is expected to yield one empty chunk, not an empty list
        assert len(chunks) == 1
        assert chunks[0].text == ""
        assert chunks[0].start_offset == 0
        assert chunks[0].end_offset == 0

    def test_chunk_overlap_positions(self):
        """Test that overlapping chunks have correct positions."""
        chunker = DocumentChunker(chunk_size=10, overlap=3)
        content = " ".join(f"word{i:02d}" for i in range(25))

        chunks = chunker.chunk_text(content)

        # 25 words at 10 words per chunk cannot fit in a single chunk; without
        # this guard the pairwise loop below could pass without checking anything
        assert len(chunks) > 1

        for current_chunk, next_chunk in zip(chunks, chunks[1:]):
            # Offsets must stay inside the document
            assert next_chunk.start_offset >= 0
            assert current_chunk.end_offset <= len(content)
            # Next chunk should start before current ends (overlap).
            # This happens because the chunker moves back by `overlap` words;
            # the actual character overlap depends on word lengths.
            assert next_chunk.start_offset < current_chunk.end_offset

    def test_unicode_content_positions(self):
        """Test position tracking with Unicode characters."""
        chunker = DocumentChunker(chunk_size=10, overlap=2)
        content = "Hello 世界 こんにちは мир Привет שלום مرحبا 你好"

        chunks = chunker.chunk_text(content)

        # Verify all chunks extract correctly (offsets index the str, not bytes)
        self._assert_offsets_match_text(content, chunks)

        # Verify full coverage when everything fits into one chunk
        if len(chunks) == 1:
            assert chunks[0].start_offset == 0
            assert chunks[0].end_offset == len(content)

    def test_single_word_chunks(self):
        """Test position tracking with single-word chunks."""
        chunker = DocumentChunker(chunk_size=1, overlap=0)
        content = "one two three"

        chunks = chunker.chunk_text(content)

        expected_words = ["one", "two", "three"]
        assert len(chunks) == len(expected_words)

        for chunk, word in zip(chunks, expected_words):
            # Both the stored text and the slice addressed by the offsets
            # must be exactly the word
            assert chunk.text == word
            assert content[chunk.start_offset : chunk.end_offset] == word

    def test_realistic_note_content(self):
        """Test with realistic note content similar to Nextcloud Notes."""
        chunker = DocumentChunker(chunk_size=50, overlap=10)
        content = """My Project Notes

This is a note about my project. It contains several paragraphs of text
that should be chunked appropriately for embedding.

## Key Points

- First important point with some details
- Second point that needs to be remembered
- Third point for future reference

The document continues with more content here. We want to make sure that
the chunking preserves context across boundaries while maintaining proper
position tracking for each chunk.

This allows us to highlight the exact chunk that matched a search query,
which builds trust in the RAG system."""

        chunks = chunker.chunk_text(content)

        # Should have multiple chunks
        assert len(chunks) > 1

        # Verify all chunks
        for chunk in chunks:
            assert isinstance(chunk, ChunkWithPosition)
            # Verify extraction
            assert content[chunk.start_offset : chunk.end_offset] == chunk.text
            # Verify positions are valid: inside the document and non-empty
            assert chunk.start_offset >= 0
            assert chunk.end_offset <= len(content)
            assert chunk.start_offset < chunk.end_offset

    def test_chunk_boundaries(self):
        """Test that chunk boundaries are word-aligned."""
        chunker = DocumentChunker(chunk_size=10, overlap=2)
        content = " ".join(f"word{i:02d}" for i in range(30))

        chunks = chunker.chunk_text(content)

        # A single chunk is the full content and is exempt from this check;
        # the condition does not depend on the chunk, so test it once up front
        if len(chunks) > 1:
            for chunk in chunks:
                # Each chunk should start with a word (not whitespace)
                assert not chunk.text[0].isspace()
                # Each chunk should end with a word (not whitespace)
                assert not chunk.text[-1].isspace()