b8010270c1
This commit addresses multiple issues with async operations, PDF metadata extraction, and type safety in document processing and search.

## Async/Await Fixes
- processor.py:259 - Added await for chunker.chunk_text(content)
- processor.py:270 - Added await for bm25_service.encode_batch(chunk_texts)
- tests/unit/test_document_chunker.py - Converted all 12 test methods to async

## PDF Metadata Enhancement
- pymupdf.py:143 - Added file_size metadata extraction
- pymupdf.py:145-206 - Refactored to extract text page-by-page
  - Manually loop through pages instead of using page_chunks=True
  - Generate page_boundaries metadata for precise page tracking
  - Works around pymupdf.layout.activate() breaking page_chunks=True
- processor.py:32-66 - Added assign_page_numbers() helper function
  - Assigns page numbers to chunks based on overlap with page boundaries
  - Handles chunks spanning multiple pages
- processor.py:298-300 - Call assign_page_numbers() for PDF files

## Type Safety Fixes
- bm25_hybrid.py:184 - Removed int() conversion of doc_id
- semantic.py:131 - Removed int() conversion of doc_id
- viz_routes.py:275 - Removed int() conversion of doc_id
- Added comments documenting that doc_id can be int (notes) or str (file paths)

## Testing
- All 18 tests passing (12 unit + 6 integration)
- No type errors in modified files
- Container logs show successful processing
- Vector viz searches working correctly

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
98 lines
3.5 KiB
Python
98 lines
3.5 KiB
Python
"""Document chunking for large texts using LangChain text splitters."""
|
|
|
|
import logging
|
|
from dataclasses import dataclass
|
|
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
|
|
# Module-level logger named after this module; DocumentChunker uses it for debug output.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@dataclass
|
|
class ChunkWithPosition:
|
|
"""A text chunk with its character position in the original document."""
|
|
|
|
text: str
|
|
start_offset: int # Character position where chunk starts
|
|
end_offset: int # Character position where chunk ends (exclusive)
|
|
page_number: int | None = None # Page number for PDF chunks (optional)
|
|
metadata: dict | None = None # Additional processor-specific metadata (optional)
|
|
|
|
|
|
class DocumentChunker:
    """Chunk large documents for optimal embedding using LangChain text splitters.

    Wraps a RecursiveCharacterTextSplitter, which tries coarse separators
    first and only falls back to finer ones when a piece is still larger
    than ``chunk_size``. No custom separators are configured here, so the
    splitter's defaults apply: per the LangChain documentation these are
    paragraph breaks, line breaks, spaces and finally single characters.
    There is no sentence-level separator, so a long paragraph can be cut
    in the middle of a sentence (at a word boundary where possible).
    """

    def __init__(self, chunk_size: int = 2048, overlap: int = 200) -> None:
        """
        Initialize document chunker.

        Args:
            chunk_size: Number of characters per chunk (default: 2048)
            overlap: Number of overlapping characters between chunks (default: 200)
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

        # Hierarchical splitting with the splitter's default separators
        # (documented by LangChain as "\n\n", "\n", " ", ""):
        #   - Paragraphs (\n\n)
        #   - Lines (\n)
        #   - Words (spaces)
        #   - Characters (last resort)
        # Sentence punctuation is NOT a separator; pass `separators=` explicitly
        # if sentence-aware splitting is ever required.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            add_start_index=True,  # Record each chunk's start offset in its metadata
            strip_whitespace=True,
        )

    @staticmethod
    def _to_chunk(doc) -> ChunkWithPosition:
        """Convert one LangChain Document into a ChunkWithPosition."""
        text = doc.page_content
        # Read the offset once; a missing "start_index" key falls back to 0.
        start = doc.metadata.get("start_index", 0)
        return ChunkWithPosition(
            text=text,
            start_offset=start,
            end_offset=start + len(text),  # exclusive end
        )

    async def chunk_text(self, content: str) -> list[ChunkWithPosition]:
        """
        Split text into overlapping chunks with position tracking.

        Delegates to the configured RecursiveCharacterTextSplitter, which
        prefers paragraph, line and word boundaries before resorting to
        character-level splitting. The start offset reported by the splitter
        is kept for every chunk so the chunk can be located in ``content``.

        Args:
            content: Text content to chunk

        Returns:
            List of chunks with their character positions in the original
            content. Empty ``content`` yields a single empty chunk.
        """
        # Handle empty content - return single empty chunk for backward compatibility
        if not content:
            return [ChunkWithPosition(text="", start_offset=0, end_offset=0)]

        # NOTE(review): whitespace-only content is not special-cased. With
        # strip_whitespace=True the splitter presumably produces no documents,
        # in which case this returns [] - confirm callers handle that.

        # Imported lazily, and only once there is real work to do. Importing the
        # submodule explicitly avoids relying on `anyio.to_thread` being reachable
        # as an attribute of the top-level package.
        from anyio import to_thread

        # Run CPU-bound text splitting in a worker thread to avoid blocking the event loop
        docs = await to_thread.run_sync(self.splitter.create_documents, [content])

        # Convert LangChain Documents to ChunkWithPosition objects
        chunks = [self._to_chunk(doc) for doc in docs]

        # Lazy %-style arguments: the message is only formatted when DEBUG is enabled.
        logger.debug(
            "Chunked document into %d chunks (chunk_size=%s, overlap=%s)",
            len(chunks),
            self.chunk_size,
            self.overlap,
        )
        return chunks
|