diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py index 90236ac..89e9921 100644 --- a/nextcloud_mcp_server/search/semantic.py +++ b/nextcloud_mcp_server/search/semantic.py @@ -150,6 +150,8 @@ class SemanticSearchAlgorithm(SearchAlgorithm): "chunk_index": result.payload.get("chunk_index"), "total_chunks": result.payload.get("total_chunks"), }, + chunk_start_offset=result.payload.get("chunk_start_offset"), + chunk_end_offset=result.payload.get("chunk_end_offset"), ) ) diff --git a/nextcloud_mcp_server/server/semantic.py b/nextcloud_mcp_server/server/semantic.py index 2f8fde6..0ff76da 100644 --- a/nextcloud_mcp_server/server/semantic.py +++ b/nextcloud_mcp_server/server/semantic.py @@ -42,6 +42,7 @@ def configure_semantic_tools(mcp: FastMCP): limit: int = 10, doc_types: list[str] | None = None, score_threshold: float = 0.0, + fusion: str = "rrf", ) -> SemanticSearchResponse: """ Search Nextcloud content using BM25 hybrid search with cross-app support. @@ -50,7 +51,7 @@ def configure_semantic_tools(mcp: FastMCP): - Dense semantic vectors: For conceptual similarity and natural language queries - BM25 sparse vectors: For precise keyword matching, acronyms, and specific terms - Results are automatically fused using Reciprocal Rank Fusion (RRF) in the + Results are automatically fused using the selected fusion algorithm in the database for optimal relevance. This provides the best of both semantic understanding and keyword precision. @@ -61,10 +62,13 @@ def configure_semantic_tools(mcp: FastMCP): query: Natural language or keyword search query limit: Maximum number of results to return (default: 10) doc_types: Document types to search (e.g., ["note", "file"]). None = search all indexed types (default) - score_threshold: Minimum RRF fusion score (0-1, default: 0.0 for RRF scoring) + score_threshold: Minimum fusion score (0-1, default: 0.0) + fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion) + RRF: Good general-purpose fusion using reciprocal ranks + DBSF: Uses distribution-based normalization, may better balance different score ranges Returns: - SemanticSearchResponse with matching documents ranked by RRF fusion scores + SemanticSearchResponse with matching documents ranked by fusion scores """ from nextcloud_mcp_server.config import get_settings @@ -74,7 +78,7 @@ def configure_semantic_tools(mcp: FastMCP): logger.info( f"BM25 hybrid search: query='{query}', user={username}, " - f"limit={limit}, score_threshold={score_threshold}" + f"limit={limit}, score_threshold={score_threshold}, fusion={fusion}" ) # Check that vector sync is enabled @@ -87,8 +91,10 @@ def configure_semantic_tools(mcp: FastMCP): ) try: - # Create BM25 hybrid search algorithm - search_algo = BM25HybridSearchAlgorithm(score_threshold=score_threshold) + # Create BM25 hybrid search algorithm with specified fusion + search_algo = BM25HybridSearchAlgorithm( + score_threshold=score_threshold, fusion=fusion + ) # Execute search across requested document types # If doc_types is None, search all indexed types (cross-app search) @@ -152,6 +158,8 @@ def configure_semantic_tools(mcp: FastMCP): total_chunks=r.metadata.get("total_chunks", 1) if r.metadata else 1, + chunk_start_offset=r.chunk_start_offset, + chunk_end_offset=r.chunk_end_offset, ) ) @@ -161,7 +169,7 @@ def configure_semantic_tools(mcp: FastMCP): results=results, query=query, total_found=len(results), - search_method="bm25_hybrid", + search_method=f"bm25_hybrid_{fusion}", ) except ValueError as e: @@ -193,6 +201,7 @@ def configure_semantic_tools(mcp: FastMCP): limit: int = 5, score_threshold: float = 0.7, max_answer_tokens: int = 500, + fusion: str = "rrf", ) -> SamplingSearchResponse: """ Semantic search with LLM-generated answer using MCP sampling. @@ -217,6 +226,7 @@ def configure_semantic_tools(mcp: FastMCP): limit: Maximum number of documents to retrieve (default: 5) score_threshold: Minimum similarity score 0-1 (default: 0.7) max_answer_tokens: Maximum tokens for generated answer (default: 500) + fusion: Fusion algorithm: "rrf" (Reciprocal Rank Fusion, default) or "dbsf" (Distribution-Based Score Fusion) Returns: SamplingSearchResponse containing: @@ -256,6 +266,7 @@ def configure_semantic_tools(mcp: FastMCP): ctx=ctx, limit=limit, score_threshold=score_threshold, + fusion=fusion, ) # 2. Handle no results case - don't waste a sampling call diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py index 12481f2..ba32135 100644 --- a/nextcloud_mcp_server/vector/processor.py +++ b/nextcloud_mcp_server/vector/processor.py @@ -233,13 +233,16 @@ async def _index_document( ) chunks = chunker.chunk_text(content) + # Extract chunk texts for embedding + chunk_texts = [chunk.text for chunk in chunks] + # Generate dense embeddings (I/O bound - external API call) embedding_service = get_embedding_service() - dense_embeddings = await embedding_service.embed_batch(chunks) + dense_embeddings = await embedding_service.embed_batch(chunk_texts) # Generate sparse embeddings (BM25 for keyword matching) bm25_service = get_bm25_service() - sparse_embeddings = bm25_service.encode_batch(chunks) + sparse_embeddings = bm25_service.encode_batch(chunk_texts) # Prepare Qdrant points indexed_at = int(time.time()) @@ -265,12 +268,15 @@ async def _index_document( "doc_id": doc_task.doc_id, "doc_type": doc_task.doc_type, "title": title, - "excerpt": chunk[:200], + "excerpt": chunk.text[:200], "indexed_at": indexed_at, "modified_at": doc_task.modified_at, "etag": etag, "chunk_index": i, "total_chunks": len(chunks), + "chunk_start_offset": chunk.start_offset, + "chunk_end_offset": chunk.end_offset, + "metadata_version": 2, # v2 includes position metadata }, ) )