6fe5596c13
Replace custom keyword/fuzzy search algorithms with industry-standard BM25 sparse vectors, combined with dense semantic vectors using Qdrant's native Reciprocal Rank Fusion (RRF). This consolidates search architecture and improves relevance for both semantic and keyword queries. Key changes: - Add fastembed dependency for BM25 sparse vector generation - Update Qdrant collection schema to support named vectors (dense + sparse) - Create BM25SparseEmbeddingProvider using FastEmbed's Qdrant/bm25 model - Implement BM25HybridSearchAlgorithm with native Qdrant RRF prefetch - Update document processor to generate both dense and sparse embeddings - Simplify nc_semantic_search() tool to use BM25 hybrid only - Remove legacy keyword.py, fuzzy.py, and custom hybrid.py (736 lines) - Update ADR-014 with implementation notes and test results Benefits: - Consolidated architecture (single Qdrant database) - Native database-level RRF fusion (more efficient) - Industry-standard BM25 (replaces brittle custom keyword search) - Better relevance across semantic and keyword queries - Simplified codebase (-285 net lines) Tests: All 125 tests passing (118 unit, 7 integration) Implements ADR-014: Replace Custom Keyword Search with BM25 Hybrid Search 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
207 lines
7.6 KiB
Python
207 lines
7.6 KiB
Python
"""BM25 hybrid search algorithm using Qdrant native RRF fusion."""
|
|
|
|
import logging
|
|
from typing import Any
|
|
|
|
from qdrant_client import models
|
|
from qdrant_client.models import FieldCondition, Filter, MatchValue
|
|
|
|
from nextcloud_mcp_server.config import get_settings
|
|
from nextcloud_mcp_server.embedding import get_bm25_service, get_embedding_service
|
|
from nextcloud_mcp_server.observability.metrics import record_qdrant_operation
|
|
from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
|
|
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BM25HybridSearchAlgorithm(SearchAlgorithm):
|
|
"""
|
|
Hybrid search combining dense semantic vectors with BM25 sparse vectors.
|
|
|
|
Uses Qdrant's native Reciprocal Rank Fusion (RRF) to automatically merge
|
|
results from both dense (semantic) and sparse (BM25 keyword) searches.
|
|
This provides the best of both worlds: semantic understanding for conceptual
|
|
queries and precise keyword matching for specific terms, acronyms, and codes.
|
|
|
|
The fusion happens efficiently in the database using the prefetch mechanism,
|
|
eliminating the need for application-layer result merging.
|
|
"""
|
|
|
|
def __init__(self, score_threshold: float = 0.0):
|
|
"""
|
|
Initialize BM25 hybrid search algorithm.
|
|
|
|
Args:
|
|
score_threshold: Minimum RRF score (0-1, default: 0.0 to allow RRF scoring)
|
|
Note: RRF produces normalized scores, so threshold is typically lower
|
|
"""
|
|
self.score_threshold = score_threshold
|
|
|
|
@property
|
|
def name(self) -> str:
|
|
return "bm25_hybrid"
|
|
|
|
@property
|
|
def requires_vector_db(self) -> bool:
|
|
return True
|
|
|
|
async def search(
|
|
self,
|
|
query: str,
|
|
user_id: str,
|
|
limit: int = 10,
|
|
doc_type: str | None = None,
|
|
**kwargs: Any,
|
|
) -> list[SearchResult]:
|
|
"""
|
|
Execute hybrid search using dense + sparse vectors with native RRF fusion.
|
|
|
|
Returns unverified results from Qdrant. Access verification should be
|
|
performed separately at the final output stage using verify_search_results().
|
|
|
|
Args:
|
|
query: Natural language or keyword search query
|
|
user_id: User ID for filtering
|
|
limit: Maximum results to return
|
|
doc_type: Optional document type filter
|
|
**kwargs: Additional parameters (score_threshold override)
|
|
|
|
Returns:
|
|
List of unverified SearchResult objects ranked by RRF fusion score
|
|
|
|
Raises:
|
|
McpError: If vector sync is not enabled or search fails
|
|
"""
|
|
settings = get_settings()
|
|
score_threshold = kwargs.get("score_threshold", self.score_threshold)
|
|
|
|
logger.info(
|
|
f"BM25 hybrid search: query='{query}', user={user_id}, "
|
|
f"limit={limit}, score_threshold={score_threshold}, doc_type={doc_type}"
|
|
)
|
|
|
|
# Generate dense embedding for semantic search
|
|
embedding_service = get_embedding_service()
|
|
dense_embedding = await embedding_service.embed(query)
|
|
logger.debug(f"Generated dense embedding (dimension={len(dense_embedding)})")
|
|
|
|
# Generate sparse embedding for BM25 keyword search
|
|
bm25_service = get_bm25_service()
|
|
sparse_embedding = bm25_service.encode(query)
|
|
logger.debug(
|
|
f"Generated sparse embedding "
|
|
f"({len(sparse_embedding['indices'])} non-zero terms)"
|
|
)
|
|
|
|
# Build Qdrant filter
|
|
filter_conditions = [
|
|
FieldCondition(
|
|
key="user_id",
|
|
match=MatchValue(value=user_id),
|
|
)
|
|
]
|
|
|
|
# Add doc_type filter if specified
|
|
if doc_type:
|
|
filter_conditions.append(
|
|
FieldCondition(
|
|
key="doc_type",
|
|
match=MatchValue(value=doc_type),
|
|
)
|
|
)
|
|
|
|
query_filter = Filter(must=filter_conditions)
|
|
|
|
# Execute hybrid search with Qdrant native RRF fusion
|
|
qdrant_client = await get_qdrant_client()
|
|
try:
|
|
# Use prefetch to run both dense and sparse searches
|
|
# Qdrant will automatically merge results using RRF
|
|
search_response = await qdrant_client.query_points(
|
|
collection_name=settings.get_collection_name(),
|
|
prefetch=[
|
|
# Dense semantic search
|
|
models.Prefetch(
|
|
query=dense_embedding,
|
|
using="dense",
|
|
limit=limit * 2, # Get extra for deduplication
|
|
filter=query_filter,
|
|
),
|
|
# Sparse BM25 search
|
|
models.Prefetch(
|
|
query=models.SparseVector(
|
|
indices=sparse_embedding["indices"],
|
|
values=sparse_embedding["values"],
|
|
),
|
|
using="sparse",
|
|
limit=limit * 2, # Get extra for deduplication
|
|
filter=query_filter,
|
|
),
|
|
],
|
|
# RRF fusion query (no additional query needed, just fusion)
|
|
query=models.FusionQuery(fusion=models.Fusion.RRF),
|
|
limit=limit * 2, # Get extra for deduplication
|
|
score_threshold=score_threshold,
|
|
with_payload=True,
|
|
with_vectors=False, # Don't return vectors to save bandwidth
|
|
)
|
|
record_qdrant_operation("search", "success")
|
|
except Exception:
|
|
record_qdrant_operation("search", "error")
|
|
raise
|
|
|
|
logger.info(
|
|
f"Qdrant RRF fusion returned {len(search_response.points)} results "
|
|
f"(before deduplication)"
|
|
)
|
|
|
|
if search_response.points:
|
|
# Log top 3 RRF scores to help with threshold tuning
|
|
top_scores = [p.score for p in search_response.points[:3]]
|
|
logger.debug(f"Top 3 RRF fusion scores: {top_scores}")
|
|
|
|
# Deduplicate by (doc_id, doc_type) - multiple chunks per document
|
|
seen_docs = set()
|
|
results = []
|
|
|
|
for result in search_response.points:
|
|
doc_id = int(result.payload["doc_id"])
|
|
doc_type = result.payload.get("doc_type", "note")
|
|
doc_key = (doc_id, doc_type)
|
|
|
|
# Skip if we've already seen this document
|
|
if doc_key in seen_docs:
|
|
continue
|
|
|
|
seen_docs.add(doc_key)
|
|
|
|
# Return unverified results (verification happens at output stage)
|
|
results.append(
|
|
SearchResult(
|
|
id=doc_id,
|
|
doc_type=doc_type,
|
|
title=result.payload.get("title", "Untitled"),
|
|
excerpt=result.payload.get("excerpt", ""),
|
|
score=result.score, # RRF fusion score
|
|
metadata={
|
|
"chunk_index": result.payload.get("chunk_index"),
|
|
"total_chunks": result.payload.get("total_chunks"),
|
|
"search_method": "bm25_hybrid_rrf",
|
|
},
|
|
)
|
|
)
|
|
|
|
if len(results) >= limit:
|
|
break
|
|
|
|
logger.info(f"Returning {len(results)} unverified results after deduplication")
|
|
if results:
|
|
result_details = [
|
|
f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')"
|
|
for r in results[:5] # Show top 5
|
|
]
|
|
logger.debug(f"Top results: {', '.join(result_details)}")
|
|
|
|
return results
|