Files
nextcloud-mcp-server/nextcloud_mcp_server/vector/document_chunker.py
T
Chris Coutinho 53689d076b feat: Improve vector visualization with static assets and fixes
- Extract CSS and JavaScript into separate static files
  - Created nextcloud_mcp_server/auth/static/vector-viz.css
  - Created nextcloud_mcp_server/auth/static/vector-viz.js
  - Updated templates to reference external assets

- Fix vector visualization issues:
  - Normalize vectors before PCA to match Qdrant's cosine distance
  - Add zero-norm and NaN detection/handling for large datasets
  - Enable responsive Plotly sizing (autosize + responsive config)
  - Widen plot area to full viewport width with minimized margins

- Improve visualization accuracy:
  - Query point now positioned correctly relative to documents
  - Handles 200+ points without JSON serialization errors
  - Full-width plot maximizes screen space utilization

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-19 04:10:44 +01:00

91 lines
3.2 KiB
Python

"""Document chunking for large texts using LangChain text splitters."""
import logging
from dataclasses import dataclass
from langchain_text_splitters import RecursiveCharacterTextSplitter
logger = logging.getLogger(__name__)
@dataclass
class ChunkWithPosition:
    """A slice of a document together with its location in that document.

    The location is a half-open character span into the original text:
    ``start_offset`` is inclusive and ``end_offset`` is exclusive.
    """

    # The chunk's own text.
    text: str
    # Character index in the original document where the chunk begins.
    start_offset: int
    # Character index just past the chunk's last character (exclusive).
    end_offset: int
class DocumentChunker:
    """Split large documents into overlapping, position-tagged chunks.

    Thin wrapper around LangChain's ``RecursiveCharacterTextSplitter``.
    No custom ``separators`` are passed, so the splitter's defaults apply;
    per the LangChain documentation those are, in order of preference:
    blank-line paragraph breaks, single newlines, spaces, and finally
    individual characters. Sentence punctuation is NOT among the default
    separators, so a chunk may end in the middle of a sentence (at a word
    boundary whenever one is available).
    """

    def __init__(self, chunk_size: int = 2048, overlap: int = 200):
        """
        Initialize document chunker.

        Args:
            chunk_size: Target maximum number of characters per chunk
                (default: 2048)
            overlap: Number of characters adjacent chunks may share
                (default: 200)
        """
        self.chunk_size = chunk_size
        self.overlap = overlap

        # The splitter tries coarse separators first and only falls back to
        # finer ones when a piece is still larger than chunk_size.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            add_start_index=True,  # expose each chunk's offset as metadata["start_index"]
            strip_whitespace=True,
        )

    @staticmethod
    def _chunk_from_doc(content: str, doc) -> ChunkWithPosition:
        """Build a ChunkWithPosition from one LangChain document of *content*."""
        text = doc.page_content
        start = doc.metadata.get("start_index", 0)
        if start < 0:
            # LangChain derives start_index with str.find, so a negative
            # value is the "not found" sentinel rather than a real position.
            # Retry from the beginning of the document; if the chunk still
            # cannot be located, pin it to 0 instead of emitting a negative
            # offset.
            start = max(content.find(text), 0)
        return ChunkWithPosition(
            text=text,
            start_offset=start,
            end_offset=start + len(text),
        )

    def chunk_text(self, content: str) -> list[ChunkWithPosition]:
        """
        Split text into overlapping chunks with position tracking.

        Chunk boundaries are chosen by the configured
        ``RecursiveCharacterTextSplitter`` (see the class docstring for the
        separator order). Each chunk carries the character span it occupies
        in ``content``; ``end_offset`` is computed as ``start_offset`` plus
        the length of the chunk text.

        Args:
            content: Text content to chunk

        Returns:
            List of chunks with their character positions in the original
            content. Empty ``content`` yields a single empty chunk spanning
            ``0..0``.
        """
        # Handle empty content - return single empty chunk for backward compatibility
        if not content:
            return [ChunkWithPosition(text="", start_offset=0, end_offset=0)]

        # Let LangChain split the text and record where each piece starts.
        docs = self.splitter.create_documents([content])

        chunks = [self._chunk_from_doc(content, doc) for doc in docs]

        # Deferred %-formatting: the message is only rendered when DEBUG
        # logging is actually enabled.
        logger.debug(
            "Chunked document into %s chunks (chunk_size=%s, overlap=%s)",
            len(chunks),
            self.chunk_size,
            self.overlap,
        )
        return chunks