11e620f2d1
Creates shared search module with four algorithms implementing ADR-012: - Semantic search (vector similarity via Qdrant) - Keyword search (token-based matching from ADR-001) - Fuzzy search (character overlap matching) - Hybrid search (RRF fusion from ADR-003) Architecture: - Base SearchAlgorithm interface for consistent API - SearchResult dataclass for unified result format - All algorithms async and independently testable - Proper logging and error handling throughout Semantic Search (search/semantic.py): - Extracted from server/semantic.py - Vector similarity using Qdrant query_points - Dual-phase authorization (vector filter + API verification) - Deduplication of document chunks - Configurable score threshold (default: 0.7) Keyword Search (search/keyword.py): - Implements ADR-001 token-based matching - Title matches weighted 3x higher than content - Case-insensitive token matching - Relevance scoring with normalization - Excerpt extraction with context Fuzzy Search (search/fuzzy.py): - Simple character overlap calculation - Configurable threshold (default: 70%) - Typo-tolerant matching - Fast and dependency-free Hybrid Search (search/hybrid.py): - Reciprocal Rank Fusion (RRF) from ADR-003 - Parallel execution of sub-algorithms - Configurable weights per algorithm - RRF constant k=60 (standard value) - Weight validation (must sum ≤1.0) All algorithms: - Share NextcloudClient for document access - Support user_id filtering (multi-tenant) - Support doc_type filtering (currently notes only) - Return consistent SearchResult objects - Properly formatted with ruff and type-checked Next steps: Update MCP tool to use these algorithms 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
241 lines
8.1 KiB
Python
241 lines
8.1 KiB
Python
"""Hybrid search algorithm using Reciprocal Rank Fusion (RRF)."""
|
|
|
|
import asyncio
|
|
import logging
|
|
from collections import defaultdict
|
|
from typing import Any
|
|
|
|
from nextcloud_mcp_server.client import NextcloudClient
|
|
from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
|
|
from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm
|
|
from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm
|
|
from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HybridSearchAlgorithm(SearchAlgorithm):
    """Hybrid search combining multiple algorithms using Reciprocal Rank Fusion.

    Implements RRF from ADR-003 to combine results from:
    - Semantic search (vector similarity)
    - Keyword search (token matching)
    - Fuzzy search (character overlap)

    RRF formula: score = weight / (k + rank)
    where k=60 (standard value) and rank is 1-indexed position.
    """

    DEFAULT_RRF_K = 60  # Standard RRF constant

    # Slack used when checking that the weights sum to at most 1.0. Binary
    # floating point makes e.g. 0.1 + 0.2 + 0.7 == 1.0000000000000002, which
    # an exact "> 1.0" comparison would wrongly reject.
    _WEIGHT_SUM_TOLERANCE = 1e-9

    def __init__(
        self,
        semantic_weight: float = 0.5,
        keyword_weight: float = 0.3,
        fuzzy_weight: float = 0.2,
        rrf_k: int = DEFAULT_RRF_K,
    ):
        """Initialize hybrid search with algorithm weights.

        Args:
            semantic_weight: Weight for semantic results (default: 0.5)
            keyword_weight: Weight for keyword results (default: 0.3)
            fuzzy_weight: Weight for fuzzy results (default: 0.2)
            rrf_k: RRF constant for rank decay (default: 60)

        Raises:
            ValueError: If any weight is negative, the weights sum to more
                than 1.0 (within a small floating-point tolerance), all
                weights are zero, or rrf_k is negative.
        """
        # Validate weights
        if semantic_weight < 0 or keyword_weight < 0 or fuzzy_weight < 0:
            raise ValueError("Weights must be non-negative")

        total_weight = semantic_weight + keyword_weight + fuzzy_weight
        if total_weight > 1.0 + self._WEIGHT_SUM_TOLERANCE:
            raise ValueError(f"Weights sum to {total_weight:.2f}, must be ≤1.0")

        if total_weight == 0.0:
            raise ValueError("At least one weight must be > 0")

        # Ranks start at 1, so a negative k could make (k + rank) zero or
        # negative; fail here instead of in the middle of a search.
        if rrf_k < 0:
            raise ValueError("rrf_k must be non-negative")

        self.semantic_weight = semantic_weight
        self.keyword_weight = keyword_weight
        self.fuzzy_weight = fuzzy_weight
        self.rrf_k = rrf_k

        # Initialize sub-algorithms
        self.semantic = SemanticSearchAlgorithm()
        self.keyword = KeywordSearchAlgorithm()
        self.fuzzy = FuzzySearchAlgorithm()

    @property
    def name(self) -> str:
        """Identifier of this algorithm ("hybrid")."""
        return "hybrid"

    @property
    def requires_vector_db(self) -> bool:
        """Whether a vector DB is needed, i.e. semantic search has non-zero weight."""
        return self.semantic_weight > 0

    async def search(
        self,
        query: str,
        user_id: str,
        limit: int = 10,
        doc_type: str | None = None,
        nextcloud_client: NextcloudClient | None = None,
        **kwargs: Any,
    ) -> list[SearchResult]:
        """Execute hybrid search using RRF to combine algorithms.

        Every sub-algorithm with a positive weight is started with
        ``limit * 2`` as its own limit, all of them are awaited together via
        ``asyncio.gather``, and their ranked lists are fused with
        ``_reciprocal_rank_fusion``.

        Args:
            query: Search query
            user_id: User ID for filtering
            limit: Maximum results to return
            doc_type: Optional document type filter
            nextcloud_client: NextcloudClient for document access
            **kwargs: Additional parameters passed to sub-algorithms

        Returns:
            List of SearchResult objects ranked by RRF combined score

        Raises:
            ValueError: If nextcloud_client not provided (needed for keyword/fuzzy)
        """
        logger.info(
            f"Hybrid search: query='{query}', user={user_id}, limit={limit}, "
            f"weights=(semantic={self.semantic_weight}, keyword={self.keyword_weight}, "
            f"fuzzy={self.fuzzy_weight})"
        )

        # Check preconditions BEFORE creating any coroutine object: raising
        # after a sub-search coroutine has been created (but not awaited)
        # leaks it and triggers a "coroutine was never awaited" warning.
        if self.keyword_weight > 0 and not nextcloud_client:
            raise ValueError("Hybrid search with keyword requires nextcloud_client")
        if self.fuzzy_weight > 0 and not nextcloud_client:
            raise ValueError("Hybrid search with fuzzy requires nextcloud_client")

        weights = {
            "semantic": self.semantic_weight,
            "keyword": self.keyword_weight,
            "fuzzy": self.fuzzy_weight,
        }
        algorithms = {
            "semantic": self.semantic,
            "keyword": self.keyword,
            "fuzzy": self.fuzzy,
        }

        # Only algorithms with a positive weight take part (dict order keeps
        # the semantic -> keyword -> fuzzy sequence).
        algo_names = [name for name, weight in weights.items() if weight > 0]

        # Each sub-search is asked for twice the final limit, presumably so
        # fusion has a larger candidate pool than the number returned.
        sub_limit = limit * 2
        tasks = [
            algorithms[name].search(
                query, user_id, sub_limit, doc_type, nextcloud_client, **kwargs
            )
            for name in algo_names
        ]

        # Execute searches in parallel
        results_list = await asyncio.gather(*tasks)

        # Build results dict
        algo_results = dict(zip(algo_names, results_list))
        for algo_name, results in algo_results.items():
            logger.debug(f"{algo_name} returned {len(results)} results")

        # Combine using RRF
        combined_results = self._reciprocal_rank_fusion(algo_results, weights, limit)

        logger.info(f"Hybrid search returned {len(combined_results)} combined results")
        # Skip building the per-result strings unless they will be emitted.
        if combined_results and logger.isEnabledFor(logging.DEBUG):
            result_details = [
                f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')"
                for r in combined_results[:5]
            ]
            logger.debug(f"Top hybrid results: {', '.join(result_details)}")

        return combined_results

    def _reciprocal_rank_fusion(
        self,
        algo_results: dict[str, list[SearchResult]],
        weights: dict[str, float],
        limit: int,
    ) -> list[SearchResult]:
        """Combine multiple ranked result lists using RRF.

        Documents are identified by ``(id, doc_type)``. Each appearance of a
        document at 1-indexed ``rank`` in an algorithm's list contributes
        ``weight / (rrf_k + rank)`` to that document's combined score.

        Args:
            algo_results: Dict of algorithm_name -> ranked results
            weights: Dict of algorithm_name -> weight (0-1); algorithms that
                are missing or have weight 0 are ignored
            limit: Maximum results to return; values <= 0 yield an empty list

        Returns:
            New SearchResult objects sorted by descending RRF score. Each
            carries the RRF score as ``score`` and a copy of the source
            result's metadata extended with ``rrf_score`` and
            ``original_score``.
        """
        # A negative limit would otherwise slice items off the tail.
        if limit <= 0:
            return []

        # Track RRF scores per document
        rrf_scores: dict[tuple[int, str], float] = defaultdict(float)
        # Track best result object for each document
        best_results: dict[tuple[int, str], SearchResult] = {}

        for algo_name, results in algo_results.items():
            weight = weights.get(algo_name, 0.0)
            if weight == 0:
                continue

            for rank, result in enumerate(results, start=1):
                doc_key = (result.id, result.doc_type)

                # RRF formula: weight / (k + rank)
                rrf_scores[doc_key] += weight / (self.rrf_k + rank)

                # Track best result object (prefer higher original scores)
                current = best_results.get(doc_key)
                if current is None or result.score > current.score:
                    best_results[doc_key] = result

        # Sort by combined RRF score (stable: ties keep first-seen order)
        sorted_docs = sorted(
            rrf_scores.items(),
            key=lambda item: item[1],
            reverse=True,
        )[:limit]

        # Build final results with RRF scores
        final_results = []
        for doc_key, rrf_score in sorted_docs:
            result = best_results[doc_key]

            # Copy the metadata so the sub-algorithm's result object is not
            # mutated when the RRF details are added.
            metadata = dict(result.metadata or {})
            metadata["rrf_score"] = rrf_score
            metadata["original_score"] = result.score

            final_results.append(
                SearchResult(
                    id=result.id,
                    doc_type=result.doc_type,
                    title=result.title,
                    excerpt=result.excerpt,
                    score=rrf_score,  # Use RRF score as the primary score
                    metadata=metadata,
                )
            )

        return final_results
|