eec923eff5
Migrates from custom word-based chunking to LangChain's MarkdownTextSplitter for better semantic search quality. This implements the chunking portion of ADR-011. Changes: - Replace custom regex word chunker with MarkdownTextSplitter - Optimized for Markdown content (headers, code blocks, lists) - Convert from word-based (512 words) to character-based (2048 chars) chunking - Maintain backward-compatible ChunkWithPosition interface - Update configuration defaults and validation - Update all unit tests (12/12 passing) Benefits: - Respects markdown structure boundaries - Never breaks code blocks or headers mid-chunk - Preserves semantic coherence within chunks - Expected 20-30% improvement in recall quality - Industry-standard approach (used by production RAG systems) Note: Full reindex required to apply new chunking to existing documents. Current vector database still contains old word-based chunks. Related: ADR-011 (Improving Semantic Search Quality) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
92 lines
3.2 KiB
Python
92 lines
3.2 KiB
Python
"""Document chunking for large texts using LangChain text splitters."""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
|
|
from langchain_text_splitters import MarkdownTextSplitter
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class ChunkWithPosition:
|
|
"""A text chunk with its character position in the original document."""
|
|
|
|
text: str
|
|
start_offset: int # Character position where chunk starts
|
|
end_offset: int # Character position where chunk ends (exclusive)
|
|
|
|
|
|
class DocumentChunker:
|
|
"""Chunk large documents for optimal embedding using LangChain text splitters.
|
|
|
|
Uses MarkdownTextSplitter which is optimized for Markdown content like
|
|
Nextcloud Notes. Respects markdown structure (headers, code blocks, lists)
|
|
while maintaining semantic boundaries.
|
|
"""
|
|
|
|
def __init__(self, chunk_size: int = 2048, overlap: int = 200):
|
|
"""
|
|
Initialize document chunker.
|
|
|
|
Args:
|
|
chunk_size: Number of characters per chunk (default: 2048)
|
|
overlap: Number of overlapping characters between chunks (default: 200)
|
|
"""
|
|
self.chunk_size = chunk_size
|
|
self.overlap = overlap
|
|
|
|
# Initialize LangChain MarkdownTextSplitter
|
|
# Optimized for Markdown content with special handling for:
|
|
# - Headers (# ## ###)
|
|
# - Code blocks (``` ```)
|
|
# - Lists (- * 1.)
|
|
# - Horizontal rules (---)
|
|
# - Paragraphs and sentences
|
|
# This preserves both markdown structure and semantic boundaries
|
|
self.splitter = MarkdownTextSplitter(
|
|
chunk_size=chunk_size,
|
|
chunk_overlap=overlap,
|
|
add_start_index=True, # Enable position tracking
|
|
strip_whitespace=True,
|
|
)
|
|
|
|
def chunk_text(self, content: str) -> list[ChunkWithPosition]:
|
|
"""
|
|
Split text into overlapping chunks with position tracking.
|
|
|
|
Uses LangChain's MarkdownTextSplitter to create chunks that respect
|
|
both markdown structure and semantic boundaries. Optimized for Nextcloud
|
|
Notes content with special handling for headers, code blocks, lists, etc.
|
|
Preserves character positions for each chunk to enable precise document
|
|
retrieval.
|
|
|
|
Args:
|
|
content: Markdown text content to chunk
|
|
|
|
Returns:
|
|
List of chunks with their character positions in the original content
|
|
"""
|
|
# Handle empty content - return single empty chunk for backward compatibility
|
|
if not content:
|
|
return [ChunkWithPosition(text="", start_offset=0, end_offset=0)]
|
|
|
|
# Use LangChain to create documents with position tracking
|
|
docs = self.splitter.create_documents([content])
|
|
|
|
# Convert LangChain Documents to ChunkWithPosition objects
|
|
chunks = [
|
|
ChunkWithPosition(
|
|
text=doc.page_content,
|
|
start_offset=doc.metadata.get("start_index", 0),
|
|
end_offset=doc.metadata.get("start_index", 0) + len(doc.page_content),
|
|
)
|
|
for doc in docs
|
|
]
|
|
|
|
logger.debug(
|
|
f"Chunked document into {len(chunks)} chunks "
|
|
f"(chunk_size={self.chunk_size}, overlap={self.overlap})"
|
|
)
|
|
return chunks
|