fffe483c02
Previously, pymupdf4llm.to_markdown() was called twice - once in PyMuPDFProcessor during indexing and again in PDFHighlighter during visualization. Different image path lengths caused different character offsets, leading to highlighted pages not matching their chunks. Also fixed issue where all chunks on the same page showed all highlights instead of just their own highlight. Now restores original page contents between chunks using xref stream caching. Changes: - Add PDFHighlighter class requiring pre-computed page_boundaries and full_text from document processor (no fallback extraction) - Pass pre-computed data from processor to highlighter - Extract page-relative portion of chunk text for cross-page chunks - Add bounding box highlighting using text anchor search - Run highlight generation in parallel with embedding/BM25 - Cache and restore page contents to isolate highlights per chunk Results: Highlighting success rate improved from 51% to 95% (121/128). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
98 lines
3.5 KiB
Python
98 lines
3.5 KiB
Python
"""Document chunking for large texts using LangChain text splitters."""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class ChunkWithPosition:
|
|
"""A text chunk with its character position in the original document."""
|
|
|
|
text: str
|
|
start_offset: int # Character position where chunk starts
|
|
end_offset: int # Character position where chunk ends (exclusive)
|
|
page_number: int | None = None # Page number for PDF chunks (optional)
|
|
metadata: dict | None = None # Additional processor-specific metadata (optional)
|
|
|
|
|
|
class DocumentChunker:
    """Chunk large documents for embedding using LangChain text splitters.

    Wraps ``RecursiveCharacterTextSplitter`` with its default separator
    hierarchy, so text is split at the coarsest boundary that still fits
    ``chunk_size``: paragraph breaks first, then line breaks, then spaces,
    and finally individual characters. No sentence-level separators are
    configured, so a sentence longer than the remaining budget may be split
    at a word boundary.
    """

    def __init__(self, chunk_size: int = 2048, overlap: int = 200):
        """
        Initialize document chunker.

        Args:
            chunk_size: Number of characters per chunk (default: 2048)
            overlap: Number of overlapping characters between chunks (default: 200)
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

        # No explicit `separators` are passed, so LangChain's defaults apply
        # (per the LangChain docs): "\n\n", "\n", " ", "" -- i.e. paragraphs,
        # lines, words, then single characters as a last resort.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            add_start_index=True,  # Records each chunk's "start_index" in its metadata
            strip_whitespace=True,
        )

    @staticmethod
    def _to_chunk(doc) -> "ChunkWithPosition":
        """Convert one LangChain document into a ChunkWithPosition."""
        # Read the start index once; fall back to 0 when the key is absent.
        start = doc.metadata.get("start_index", 0)
        return ChunkWithPosition(
            text=doc.page_content,
            start_offset=start,
            end_offset=start + len(doc.page_content),
        )

    async def chunk_text(self, content: str) -> list[ChunkWithPosition]:
        """
        Split text into overlapping chunks with position tracking.

        Chunk boundaries follow the splitter's separator hierarchy
        (paragraph breaks, line breaks, spaces, then characters). Each
        returned chunk carries the character offsets of its text within
        ``content``, where ``end_offset`` is ``start_offset`` plus the
        length of the chunk text (exclusive end).

        Args:
            content: Text content to chunk

        Returns:
            List of chunks with their character positions in the original
            content. Empty ``content`` yields a single empty chunk with
            both offsets set to 0.
        """
        import anyio

        # Handle empty content - return single empty chunk for backward compatibility
        if not content:
            return [ChunkWithPosition(text="", start_offset=0, end_offset=0)]

        # Run the splitting in a worker thread so the event loop is not blocked
        docs = await anyio.to_thread.run_sync(  # type: ignore[attr-defined]
            self.splitter.create_documents,
            [content],
        )

        chunks = [self._to_chunk(doc) for doc in docs]

        # Lazy %-style arguments: the message is only formatted when DEBUG
        # logging is actually enabled.
        logger.debug(
            "Chunked document into %s chunks (chunk_size=%s, overlap=%s)",
            len(chunks),
            self.chunk_size,
            self.overlap,
        )
        return chunks
|