feat(vector): Add configurable chunk size and overlap for document embedding
Enable users to tune document chunking parameters to match their embedding model and content type by adding DOCUMENT_CHUNK_SIZE and DOCUMENT_CHUNK_OVERLAP environment variables.

- **config.py**: Added `document_chunk_size` (default: 512) and `document_chunk_overlap` (default: 50) configuration fields with validation:
  - Ensures overlap < chunk_size
  - Warns if chunk_size < 100 words
  - Prevents negative overlap values
- **processor.py**: Updated DocumentChunker instantiation to use config settings instead of hardcoded values (lines 174-177)
- **tests/unit/test_config.py**: Added TestChunkConfigValidation class with 9 tests covering:
  - Default values
  - Valid configurations
  - Validation errors (overlap >= chunk_size, negative overlap)
  - Warning for small chunk sizes
  - Environment variable loading
- **docs/configuration.md**: Added comprehensive "Document Chunking Configuration" section with:
  - Chunk size selection guidance (256-384 vs 512 vs 768-1024 words)
  - Overlap recommendations (10-20% of chunk size)
  - Configuration examples for different use cases
  - Added env vars to reference table
- **docs/semantic-search-architecture.md**: Added "Document Chunking Strategy" section with:
  - Chunking process explanation
  - Example showing sliding window behavior
  - Search behavior with chunks
  - Tuning recommendations
- **env.sample**: Added complete "Semantic Search & Vector Sync Configuration" section with:
  - Vector sync settings
  - Qdrant configuration (3 modes)
  - Ollama embedding service
  - Document chunking configuration
- **docker-compose.yml**: Added commented examples for DOCUMENT_CHUNK_SIZE and DOCUMENT_CHUNK_OVERLAP with usage notes

```bash
DOCUMENT_CHUNK_SIZE=512
DOCUMENT_CHUNK_OVERLAP=50
```

1. `overlap` must be less than `chunk_size`
2. `overlap` cannot be negative
3. Warning issued if `chunk_size` < 100 words

**Precise matching** (small notes, specific queries):

```bash
DOCUMENT_CHUNK_SIZE=256
DOCUMENT_CHUNK_OVERLAP=25
```

**Balanced** (default, general purpose):

```bash
DOCUMENT_CHUNK_SIZE=512
DOCUMENT_CHUNK_OVERLAP=50
```

**Contextual** (long documents, broader topics):

```bash
DOCUMENT_CHUNK_SIZE=1024
DOCUMENT_CHUNK_OVERLAP=100
```

✅ **User control** - Tune chunking to match embedding model capabilities
✅ **Experimentation** - Test different chunk sizes for optimal results
✅ **Model alignment** - Match chunk size to embedding context window
✅ **Backward compatible** - Defaults maintain existing behavior
✅ **Well validated** - Comprehensive tests prevent misconfiguration

All 22 config validation tests pass (9 new tests for chunking):

- Default values work correctly
- Validation prevents invalid configurations
- Environment variables load properly
- Warning system works as expected

With configurable chunk sizes, users can now experiment with different Ollama embedding models and tune chunk parameters for optimal semantic search quality.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -151,3 +151,111 @@ class TestGetSettings:
|
||||
assert settings.vector_sync_scan_interval == 600
|
||||
assert settings.vector_sync_processor_workers == 5
|
||||
assert settings.vector_sync_queue_max_size == 5000
|
||||
|
||||
|
||||
class TestChunkConfigValidation:
    """Checks for the document chunk size / overlap configuration fields."""

    def test_default_chunk_settings(self):
        """Settings built with no arguments expose 512 / 50 as chunk size / overlap."""
        defaults = Settings()
        assert (defaults.document_chunk_size, defaults.document_chunk_overlap) == (
            512,
            50,
        )

    def test_valid_chunk_settings(self):
        """An explicit size with a smaller overlap is stored unchanged."""
        overrides = {"document_chunk_size": 1024, "document_chunk_overlap": 100}
        cfg = Settings(**overrides)
        assert cfg.document_chunk_size == 1024
        assert cfg.document_chunk_overlap == 100

    def test_overlap_greater_than_or_equal_to_chunk_size_raises_error(self):
        """An overlap equal to the chunk size is rejected with ValueError."""
        expected = "DOCUMENT_CHUNK_OVERLAP .* must be less than DOCUMENT_CHUNK_SIZE"
        with pytest.raises(ValueError, match=expected):
            Settings(document_chunk_size=512, document_chunk_overlap=512)

    def test_overlap_larger_than_chunk_size_raises_error(self):
        """An overlap exceeding the chunk size is rejected with ValueError."""
        expected = "DOCUMENT_CHUNK_OVERLAP .* must be less than DOCUMENT_CHUNK_SIZE"
        with pytest.raises(ValueError, match=expected):
            Settings(document_chunk_size=256, document_chunk_overlap=300)

    def test_negative_overlap_raises_error(self):
        """A negative overlap is rejected with ValueError."""
        expected = "DOCUMENT_CHUNK_OVERLAP .* cannot be negative"
        with pytest.raises(ValueError, match=expected):
            Settings(document_chunk_size=512, document_chunk_overlap=-10)

    def test_small_chunk_size_warning(self, caplog):
        """A 64-word chunk size logs the 'quite small' warning and its advice."""
        import logging

        # Capture WARNING records emitted by the config module's logger.
        caplog.set_level(logging.WARNING, logger="nextcloud_mcp_server.config")
        Settings(document_chunk_size=64, document_chunk_overlap=10)

        captured = caplog.text
        assert "DOCUMENT_CHUNK_SIZE is set to 64 words, which is quite small" in captured
        assert "Consider using at least 256 words" in captured

    def test_reasonable_chunk_size_no_warning(self, caplog):
        """A 256-word chunk size logs nothing that mentions DOCUMENT_CHUNK_SIZE."""
        import logging

        # Capture WARNING records emitted by the config module's logger.
        caplog.set_level(logging.WARNING, logger="nextcloud_mcp_server.config")
        Settings(document_chunk_size=256, document_chunk_overlap=25)

        captured = caplog.text
        assert "DOCUMENT_CHUNK_SIZE" not in captured

    @patch.dict(
        os.environ,
        {"DOCUMENT_CHUNK_SIZE": "1024", "DOCUMENT_CHUNK_OVERLAP": "102"},
        clear=True,
    )
    def test_get_settings_chunk_config(self):
        """get_settings() picks up chunk size and overlap from the environment."""
        loaded = get_settings()
        assert loaded.document_chunk_size == 1024
        assert loaded.document_chunk_overlap == 102

    @patch.dict(
        os.environ,
        {"DOCUMENT_CHUNK_SIZE": "256", "DOCUMENT_CHUNK_OVERLAP": "256"},
        clear=True,
    )
    def test_get_settings_invalid_chunk_config_raises_error(self):
        """get_settings() raises ValueError when the env overlap equals the env size."""
        expected = "DOCUMENT_CHUNK_OVERLAP .* must be less than DOCUMENT_CHUNK_SIZE"
        with pytest.raises(ValueError, match=expected):
            get_settings()
|
||||
|
||||
Reference in New Issue
Block a user