feat: add chunk position tracking to vector indexing and search

Track character offsets (start_offset, end_offset) for each chunk in the vector database metadata, enabling precise chunk highlighting in the visualization pane.

Changes:
- processor.py: Store chunk_start_offset and chunk_end_offset in Qdrant metadata
- processor.py: Add metadata_version=2 to indicate position tracking support
- search/semantic.py: Return chunk positions from search results
- server/semantic.py: Expose chunk positions in API responses (SemanticSearchResult)
- server/semantic.py: Add a `fusion` parameter ("rrf" by default, or "dbsf") to the semantic search tools, pass it through to BM25HybridSearchAlgorithm, and report it in search_method as `bm25_hybrid_{fusion}`

This enables the visualization pane to:
1. Display the exact matched chunk with surrounding context
2. Highlight the precise portion of text that matched the query
3. Build user trust by showing what the RAG system actually retrieved

Position tracking uses the ChunkWithPosition dataclass from document_chunker.py, which provides character-accurate offsets in the original document.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-17 06:47:58 +01:00
parent c3282534eb
commit 3aa7128f45
3 changed files with 29 additions and 10 deletions
@@ -150,6 +150,8 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
                        "chunk_index": result.payload.get("chunk_index"),
                        "total_chunks": result.payload.get("total_chunks"),
                    },
+                    chunk_start_offset=result.payload.get("chunk_start_offset"),
+                    chunk_end_offset=result.payload.get("chunk_end_offset"),
                )
            )

@@ -42,6 +42,7 @@ def configure_semantic_tools(mcp: FastMCP):
        limit: int = 10,
        doc_types: list[str] | None = None,
        score_threshold: float = 0.0,
+        fusion: str = "rrf",
    ) -> SemanticSearchResponse:
        """
        Search Nextcloud content using BM25 hybrid search with cross-app support.
@@ -50,7 +51,7 @@ def configure_semantic_tools(mcp: FastMCP):
        - Dense semantic vectors: For conceptual similarity and natural language queries
        - BM25 sparse vectors: For precise keyword matching, acronyms, and specific terms

-        Results are automatically fused using Reciprocal Rank Fusion (RRF) in the
+        Results are automatically fused using the selected fusion algorithm in the
        database for optimal relevance. This provides the best of both semantic
        understanding and keyword precision.

@@ -61,10 +62,13 @@ def configure_semantic_tools(mcp: FastMCP):
            query: Natural language or keyword search query
            limit: Maximum number of results to return (default: 10)
            doc_types: Document types to search (e.g., ["note", "file"]). None = search all indexed types (default)
-            score_threshold: Minimum RRF fusion score (0-1, default: 0.0 for RRF scoring)
+            score_threshold: Minimum fusion score (0-1, default: 0.0)
+            fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion)
+                   RRF: Good general-purpose fusion using reciprocal ranks
+                   DBSF: Uses distribution-based normalization, may better balance different score ranges

        Returns:
-            SemanticSearchResponse with matching documents ranked by RRF fusion scores
+            SemanticSearchResponse with matching documents ranked by fusion scores
        """
        from nextcloud_mcp_server.config import get_settings

@@ -74,7 +78,7 @@ def configure_semantic_tools(mcp: FastMCP):

        logger.info(
            f"BM25 hybrid search: query='{query}', user={username}, "
-            f"limit={limit}, score_threshold={score_threshold}"
+            f"limit={limit}, score_threshold={score_threshold}, fusion={fusion}"
        )

        # Check that vector sync is enabled
@@ -87,8 +91,10 @@ def configure_semantic_tools(mcp: FastMCP):
            )

        try:
-            # Create BM25 hybrid search algorithm
-            search_algo = BM25HybridSearchAlgorithm(score_threshold=score_threshold)
+            # Create BM25 hybrid search algorithm with specified fusion
+            search_algo = BM25HybridSearchAlgorithm(
+                score_threshold=score_threshold, fusion=fusion
+            )

            # Execute search across requested document types
            # If doc_types is None, search all indexed types (cross-app search)
@@ -152,6 +158,8 @@ def configure_semantic_tools(mcp: FastMCP):
                        total_chunks=r.metadata.get("total_chunks", 1)
                        if r.metadata
                        else 1,
+                        chunk_start_offset=r.chunk_start_offset,
+                        chunk_end_offset=r.chunk_end_offset,
                    )
                )

@@ -161,7 +169,7 @@ def configure_semantic_tools(mcp: FastMCP):
                results=results,
                query=query,
                total_found=len(results),
-                search_method="bm25_hybrid",
+                search_method=f"bm25_hybrid_{fusion}",
            )

        except ValueError as e:
@@ -193,6 +201,7 @@ def configure_semantic_tools(mcp: FastMCP):
        limit: int = 5,
        score_threshold: float = 0.7,
        max_answer_tokens: int = 500,
+        fusion: str = "rrf",
    ) -> SamplingSearchResponse:
        """
        Semantic search with LLM-generated answer using MCP sampling.
@@ -217,6 +226,7 @@ def configure_semantic_tools(mcp: FastMCP):
            limit: Maximum number of documents to retrieve (default: 5)
            score_threshold: Minimum similarity score 0-1 (default: 0.7)
            max_answer_tokens: Maximum tokens for generated answer (default: 500)
+            fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion)

        Returns:
            SamplingSearchResponse containing:
@@ -256,6 +266,7 @@ def configure_semantic_tools(mcp: FastMCP):
            ctx=ctx,
            limit=limit,
            score_threshold=score_threshold,
+            fusion=fusion,
        )

        # 2. Handle no results case - don't waste a sampling call
@@ -233,13 +233,16 @@ async def _index_document(
    )
    chunks = chunker.chunk_text(content)

+    # Extract chunk texts for embedding
+    chunk_texts = [chunk.text for chunk in chunks]
+
    # Generate dense embeddings (I/O bound - external API call)
    embedding_service = get_embedding_service()
-    dense_embeddings = await embedding_service.embed_batch(chunks)
+    dense_embeddings = await embedding_service.embed_batch(chunk_texts)

    # Generate sparse embeddings (BM25 for keyword matching)
    bm25_service = get_bm25_service()
-    sparse_embeddings = bm25_service.encode_batch(chunks)
+    sparse_embeddings = bm25_service.encode_batch(chunk_texts)

    # Prepare Qdrant points
    indexed_at = int(time.time())
@@ -265,12 +268,15 @@ async def _index_document(
                    "doc_id": doc_task.doc_id,
                    "doc_type": doc_task.doc_type,
                    "title": title,
-                    "excerpt": chunk[:200],
+                    "excerpt": chunk.text[:200],
                    "indexed_at": indexed_at,
                    "modified_at": doc_task.modified_at,
                    "etag": etag,
                    "chunk_index": i,
                    "total_chunks": len(chunks),
+                    "chunk_start_offset": chunk.start_offset,
+                    "chunk_end_offset": chunk.end_offset,
+                    "metadata_version": 2,  # v2 includes position metadata
                },
            )
        )