862308418e
Fixed a critical infinite loop bug in document_chunker.py that occurred when the overlap parameter caused the chunker to not make forward progress. Changes: - Added ChunkWithPosition dataclass to track character positions - Refactored chunk_text() to use regex word matching for accurate position tracking - Added safety check to ensure forward progress (next_start_idx > start_idx) - Changed return type from list[str] to list[ChunkWithPosition] The bug manifested when: 1. end_idx reached len(word_matches) (processing last chunk) 2. next_start_idx = end_idx - overlap would not advance past start_idx 3. Loop would continue indefinitely without making progress Fix ensures chunker always terminates by breaking when not advancing. All 9 unit tests now pass in 1.66s (previously timing out at 180s). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
191 lines
7.1 KiB
Python
191 lines
7.1 KiB
Python
"""Unit tests for DocumentChunker with position tracking."""
|
|
|
|
from nextcloud_mcp_server.vector.document_chunker import (
|
|
ChunkWithPosition,
|
|
DocumentChunker,
|
|
)
|
|
|
|
|
|
class TestDocumentChunkerPositions:
|
|
"""Test suite for DocumentChunker position tracking functionality."""
|
|
|
|
def test_single_chunk_simple_text(self):
|
|
"""Test that single-chunk documents return correct positions."""
|
|
chunker = DocumentChunker(chunk_size=512, overlap=50)
|
|
content = "This is a short document."
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
assert len(chunks) == 1
|
|
assert isinstance(chunks[0], ChunkWithPosition)
|
|
assert chunks[0].text == content
|
|
assert chunks[0].start_offset == 0
|
|
assert chunks[0].end_offset == len(content)
|
|
|
|
def test_multiple_chunks_positions(self):
|
|
"""Test that multi-chunk documents have correct positions."""
|
|
chunker = DocumentChunker(chunk_size=10, overlap=2) # Small chunks for testing
|
|
# Create content with exactly 30 words
|
|
words = [f"word{i:02d}" for i in range(30)]
|
|
content = " ".join(words)
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
# Verify we got multiple chunks (30 words, 10 per chunk, 2 overlap = 4 chunks)
|
|
assert len(chunks) == 4
|
|
|
|
# Verify all chunks are ChunkWithPosition
|
|
for chunk in chunks:
|
|
assert isinstance(chunk, ChunkWithPosition)
|
|
|
|
# Verify first chunk starts at 0
|
|
assert chunks[0].start_offset == 0
|
|
|
|
# Verify last chunk ends at content length
|
|
assert chunks[-1].end_offset == len(content)
|
|
|
|
# Verify chunks are contiguous or overlap (no gaps)
|
|
for i in range(len(chunks) - 1):
|
|
# Next chunk should start at or before current chunk ends
|
|
assert chunks[i + 1].start_offset <= chunks[i].end_offset
|
|
|
|
# Verify we can reconstruct the content using positions
|
|
for chunk in chunks:
|
|
extracted = content[chunk.start_offset : chunk.end_offset]
|
|
assert extracted == chunk.text
|
|
|
|
def test_chunk_positions_with_whitespace(self):
|
|
"""Test position tracking with various whitespace."""
|
|
chunker = DocumentChunker(chunk_size=5, overlap=1)
|
|
content = "word1 word2\n\nword3\tword4 word5 word6"
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
# Verify positions correctly handle whitespace
|
|
for chunk in chunks:
|
|
extracted = content[chunk.start_offset : chunk.end_offset]
|
|
assert extracted == chunk.text
|
|
# Verify no leading/trailing whitespace unless in original
|
|
if chunk != chunks[0] and chunk != chunks[-1]:
|
|
# Middle chunks should be extracted correctly
|
|
assert len(chunk.text.strip()) > 0
|
|
|
|
def test_empty_content(self):
|
|
"""Test that empty content returns empty chunk."""
|
|
chunker = DocumentChunker(chunk_size=512, overlap=50)
|
|
content = ""
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
assert len(chunks) == 1
|
|
assert chunks[0].text == ""
|
|
assert chunks[0].start_offset == 0
|
|
assert chunks[0].end_offset == 0
|
|
|
|
def test_chunk_overlap_positions(self):
|
|
"""Test that overlapping chunks have correct positions."""
|
|
chunker = DocumentChunker(chunk_size=10, overlap=3)
|
|
words = [f"word{i:02d}" for i in range(25)]
|
|
content = " ".join(words)
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
# Verify overlap exists
|
|
for i in range(len(chunks) - 1):
|
|
current_chunk = chunks[i]
|
|
next_chunk = chunks[i + 1]
|
|
|
|
# Next chunk should start before current ends (overlap)
|
|
# This happens because we move back by overlap words
|
|
# The actual character overlap depends on word lengths
|
|
assert next_chunk.start_offset >= 0
|
|
assert current_chunk.end_offset <= len(content)
|
|
|
|
def test_unicode_content_positions(self):
|
|
"""Test position tracking with Unicode characters."""
|
|
chunker = DocumentChunker(chunk_size=10, overlap=2)
|
|
content = "Hello 世界 こんにちは мир Привет שלום مرحبا 你好"
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
# Verify all chunks extract correctly
|
|
for chunk in chunks:
|
|
extracted = content[chunk.start_offset : chunk.end_offset]
|
|
assert extracted == chunk.text
|
|
|
|
# Verify full coverage
|
|
if len(chunks) == 1:
|
|
assert chunks[0].start_offset == 0
|
|
assert chunks[0].end_offset == len(content)
|
|
|
|
def test_single_word_chunks(self):
|
|
"""Test position tracking with single-word chunks."""
|
|
chunker = DocumentChunker(chunk_size=1, overlap=0)
|
|
content = "one two three"
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
assert len(chunks) == 3
|
|
assert chunks[0].text == "one"
|
|
assert chunks[1].text == "two"
|
|
assert chunks[2].text == "three"
|
|
|
|
# Verify positions
|
|
assert content[chunks[0].start_offset : chunks[0].end_offset] == "one"
|
|
assert content[chunks[1].start_offset : chunks[1].end_offset] == "two"
|
|
assert content[chunks[2].start_offset : chunks[2].end_offset] == "three"
|
|
|
|
def test_realistic_note_content(self):
|
|
"""Test with realistic note content similar to Nextcloud Notes."""
|
|
chunker = DocumentChunker(chunk_size=50, overlap=10)
|
|
content = """My Project Notes
|
|
|
|
This is a note about my project. It contains several paragraphs of text
|
|
that should be chunked appropriately for embedding.
|
|
|
|
## Key Points
|
|
|
|
- First important point with some details
|
|
- Second point that needs to be remembered
|
|
- Third point for future reference
|
|
|
|
The document continues with more content here. We want to make sure that
|
|
the chunking preserves context across boundaries while maintaining proper
|
|
position tracking for each chunk.
|
|
|
|
This allows us to highlight the exact chunk that matched a search query,
|
|
which builds trust in the RAG system."""
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
# Should have multiple chunks
|
|
assert len(chunks) > 1
|
|
|
|
# Verify all chunks
|
|
for chunk in chunks:
|
|
assert isinstance(chunk, ChunkWithPosition)
|
|
# Verify extraction
|
|
extracted = content[chunk.start_offset : chunk.end_offset]
|
|
assert extracted == chunk.text
|
|
# Verify positions are valid
|
|
assert chunk.start_offset >= 0
|
|
assert chunk.end_offset <= len(content)
|
|
assert chunk.start_offset < chunk.end_offset
|
|
|
|
def test_chunk_boundaries(self):
|
|
"""Test that chunk boundaries are word-aligned."""
|
|
chunker = DocumentChunker(chunk_size=10, overlap=2)
|
|
words = [f"word{i:02d}" for i in range(30)]
|
|
content = " ".join(words)
|
|
|
|
chunks = chunker.chunk_text(content)
|
|
|
|
for chunk in chunks:
|
|
# Verify chunk text starts and ends with word characters (no split words)
|
|
# Unless it's the full content
|
|
if len(chunks) > 1:
|
|
# Each chunk should start with a word (not whitespace)
|
|
assert chunk.text[0].strip() != ""
|
|
# Each chunk should end with a word (not whitespace)
|
|
assert chunk.text[-1].strip() != ""
|