862308418e
Fixed a critical infinite loop bug in document_chunker.py that occurred when the overlap parameter caused the chunker to not make forward progress. Changes: - Added ChunkWithPosition dataclass to track character positions - Refactored chunk_text() to use regex word matching for accurate position tracking - Added safety check to ensure forward progress (next_start_idx > start_idx) - Changed return type from list[str] to list[ChunkWithPosition] The bug manifested when: 1. end_idx reached len(word_matches) (processing last chunk) 2. next_start_idx = end_idx - overlap would not advance past start_idx 3. Loop would continue indefinitely without making progress Fix ensures chunker always terminates by breaking when not advancing. All 9 unit tests now pass in 1.66s (previously timing out at 180s). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
96 lines
3.0 KiB
Python
96 lines
3.0 KiB
Python
"""Document chunking for large texts."""
|
|
|
|
import logging
|
|
import re
|
|
from dataclasses import dataclass
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class ChunkWithPosition:
|
|
"""A text chunk with its character position in the original document."""
|
|
|
|
text: str
|
|
start_offset: int # Character position where chunk starts
|
|
end_offset: int # Character position where chunk ends (exclusive)
|
|
|
|
|
|
class DocumentChunker:
|
|
"""Chunk large documents for optimal embedding."""
|
|
|
|
def __init__(self, chunk_size: int = 512, overlap: int = 50):
|
|
"""
|
|
Initialize document chunker.
|
|
|
|
Args:
|
|
chunk_size: Number of words per chunk (default: 512)
|
|
overlap: Number of overlapping words between chunks (default: 50)
|
|
"""
|
|
self.chunk_size = chunk_size
|
|
self.overlap = overlap
|
|
|
|
def chunk_text(self, content: str) -> list[ChunkWithPosition]:
|
|
"""
|
|
Split text into overlapping chunks with position tracking.
|
|
|
|
Uses simple word-based chunking with configurable overlap to preserve
|
|
context across chunk boundaries. Tracks character positions for each chunk.
|
|
|
|
Args:
|
|
content: Text content to chunk
|
|
|
|
Returns:
|
|
List of chunks with their character positions in the original content
|
|
"""
|
|
# Use regex to find all words and their positions
|
|
# This preserves the original spacing and allows accurate position tracking
|
|
word_pattern = re.compile(r"\S+")
|
|
word_matches = list(word_pattern.finditer(content))
|
|
|
|
if len(word_matches) <= self.chunk_size:
|
|
# Single chunk - use entire content
|
|
return [
|
|
ChunkWithPosition(text=content, start_offset=0, end_offset=len(content))
|
|
]
|
|
|
|
chunks = []
|
|
start_idx = 0
|
|
|
|
while start_idx < len(word_matches):
|
|
end_idx = min(start_idx + self.chunk_size, len(word_matches))
|
|
|
|
# Get the first and last word positions
|
|
first_word = word_matches[start_idx]
|
|
last_word = word_matches[end_idx - 1]
|
|
|
|
# Extract chunk using character positions
|
|
start_offset = first_word.start()
|
|
end_offset = last_word.end()
|
|
chunk_text = content[start_offset:end_offset]
|
|
|
|
chunks.append(
|
|
ChunkWithPosition(
|
|
text=chunk_text, start_offset=start_offset, end_offset=end_offset
|
|
)
|
|
)
|
|
|
|
# If we've reached the end, break
|
|
if end_idx >= len(word_matches):
|
|
break
|
|
|
|
# Move to next chunk with overlap
|
|
next_start_idx = end_idx - self.overlap
|
|
|
|
# Safety check: ensure we're making forward progress
|
|
# If we're not advancing (overlap >= chunk processed), break to prevent infinite loop
|
|
if next_start_idx <= start_idx:
|
|
break
|
|
|
|
start_idx = next_start_idx
|
|
|
|
logger.debug(
|
|
f"Chunked document into {len(chunks)} chunks ({len(word_matches)} words)"
|
|
)
|
|
return chunks
|