feat: add chunk position tracking to vector indexing and search

Track character offsets (start_offset, end_offset) for each chunk in the vector
database metadata, enabling precise chunk highlighting in the visualization pane.

Changes:
- processor.py: Store chunk_start_offset and chunk_end_offset in Qdrant metadata
- processor.py: Add metadata_version=2 to indicate position tracking support
- search/semantic.py: Return chunk positions from search results
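- server/semantic.py: Expose chunk positions in API responses (SemanticSearchResult)
- server/semantic.py: Add `fusion` parameter ("rrf" or "dbsf") to the semantic search tools and report it in search_method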

Enables the visualization pane to:
1. Display exact matched chunk with surrounding context
2. Highlight the precise portion of text that matched the query
3. Build user trust by showing what the RAG system actually retrieved

Position tracking uses the ChunkWithPosition dataclass from document_chunker.py,
which provides character-accurate offsets in the original document.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Chris Coutinho
2025-11-17 06:47:58 +01:00
parent c3282534eb
commit 3aa7128f45
3 changed files with 29 additions and 10 deletions
+2
View File
@@ -150,6 +150,8 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
"chunk_index": result.payload.get("chunk_index"),
"total_chunks": result.payload.get("total_chunks"),
},
chunk_start_offset=result.payload.get("chunk_start_offset"),
chunk_end_offset=result.payload.get("chunk_end_offset"),
)
)
+18 -7
View File
@@ -42,6 +42,7 @@ def configure_semantic_tools(mcp: FastMCP):
limit: int = 10,
doc_types: list[str] | None = None,
score_threshold: float = 0.0,
fusion: str = "rrf",
) -> SemanticSearchResponse:
"""
Search Nextcloud content using BM25 hybrid search with cross-app support.
@@ -50,7 +51,7 @@ def configure_semantic_tools(mcp: FastMCP):
- Dense semantic vectors: For conceptual similarity and natural language queries
- BM25 sparse vectors: For precise keyword matching, acronyms, and specific terms
Results are automatically fused using Reciprocal Rank Fusion (RRF) in the
Results are automatically fused using the selected fusion algorithm in the
database for optimal relevance. This provides the best of both semantic
understanding and keyword precision.
@@ -61,10 +62,13 @@ def configure_semantic_tools(mcp: FastMCP):
query: Natural language or keyword search query
limit: Maximum number of results to return (default: 10)
doc_types: Document types to search (e.g., ["note", "file"]). None = search all indexed types (default)
score_threshold: Minimum RRF fusion score (0-1, default: 0.0 for RRF scoring)
score_threshold: Minimum fusion score (0-1, default: 0.0)
fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion)
RRF: Good general-purpose fusion using reciprocal ranks
DBSF: Uses distribution-based normalization, may better balance different score ranges
Returns:
SemanticSearchResponse with matching documents ranked by RRF fusion scores
SemanticSearchResponse with matching documents ranked by fusion scores
"""
from nextcloud_mcp_server.config import get_settings
@@ -74,7 +78,7 @@ def configure_semantic_tools(mcp: FastMCP):
logger.info(
f"BM25 hybrid search: query='{query}', user={username}, "
f"limit={limit}, score_threshold={score_threshold}"
f"limit={limit}, score_threshold={score_threshold}, fusion={fusion}"
)
# Check that vector sync is enabled
@@ -87,8 +91,10 @@ def configure_semantic_tools(mcp: FastMCP):
)
try:
# Create BM25 hybrid search algorithm
search_algo = BM25HybridSearchAlgorithm(score_threshold=score_threshold)
# Create BM25 hybrid search algorithm with specified fusion
search_algo = BM25HybridSearchAlgorithm(
score_threshold=score_threshold, fusion=fusion
)
# Execute search across requested document types
# If doc_types is None, search all indexed types (cross-app search)
@@ -152,6 +158,8 @@ def configure_semantic_tools(mcp: FastMCP):
total_chunks=r.metadata.get("total_chunks", 1)
if r.metadata
else 1,
chunk_start_offset=r.chunk_start_offset,
chunk_end_offset=r.chunk_end_offset,
)
)
@@ -161,7 +169,7 @@ def configure_semantic_tools(mcp: FastMCP):
results=results,
query=query,
total_found=len(results),
search_method="bm25_hybrid",
search_method=f"bm25_hybrid_{fusion}",
)
except ValueError as e:
@@ -193,6 +201,7 @@ def configure_semantic_tools(mcp: FastMCP):
limit: int = 5,
score_threshold: float = 0.7,
max_answer_tokens: int = 500,
fusion: str = "rrf",
) -> SamplingSearchResponse:
"""
Semantic search with LLM-generated answer using MCP sampling.
@@ -217,6 +226,7 @@ def configure_semantic_tools(mcp: FastMCP):
limit: Maximum number of documents to retrieve (default: 5)
score_threshold: Minimum similarity score 0-1 (default: 0.7)
max_answer_tokens: Maximum tokens for generated answer (default: 500)
fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion)
Returns:
SamplingSearchResponse containing:
@@ -256,6 +266,7 @@ def configure_semantic_tools(mcp: FastMCP):
ctx=ctx,
limit=limit,
score_threshold=score_threshold,
fusion=fusion,
)
# 2. Handle no results case - don't waste a sampling call
+9 -3
View File
@@ -233,13 +233,16 @@ async def _index_document(
)
chunks = chunker.chunk_text(content)
# Extract chunk texts for embedding
chunk_texts = [chunk.text for chunk in chunks]
# Generate dense embeddings (I/O bound - external API call)
embedding_service = get_embedding_service()
dense_embeddings = await embedding_service.embed_batch(chunks)
dense_embeddings = await embedding_service.embed_batch(chunk_texts)
# Generate sparse embeddings (BM25 for keyword matching)
bm25_service = get_bm25_service()
sparse_embeddings = bm25_service.encode_batch(chunks)
sparse_embeddings = bm25_service.encode_batch(chunk_texts)
# Prepare Qdrant points
indexed_at = int(time.time())
@@ -265,12 +268,15 @@ async def _index_document(
"doc_id": doc_task.doc_id,
"doc_type": doc_task.doc_type,
"title": title,
"excerpt": chunk[:200],
"excerpt": chunk.text[:200],
"indexed_at": indexed_at,
"modified_at": doc_task.modified_at,
"etag": etag,
"chunk_index": i,
"total_chunks": len(chunks),
"chunk_start_offset": chunk.start_offset,
"chunk_end_offset": chunk.end_offset,
"metadata_version": 2, # v2 includes position metadata
},
)
)