6fe5596c13
Replace custom keyword/fuzzy search algorithms with industry-standard BM25 sparse vectors, combined with dense semantic vectors using Qdrant's native Reciprocal Rank Fusion (RRF). This consolidates search architecture and improves relevance for both semantic and keyword queries.

Key changes:
- Add fastembed dependency for BM25 sparse vector generation
- Update Qdrant collection schema to support named vectors (dense + sparse)
- Create BM25SparseEmbeddingProvider using FastEmbed's Qdrant/bm25 model
- Implement BM25HybridSearchAlgorithm with native Qdrant RRF prefetch
- Update document processor to generate both dense and sparse embeddings
- Simplify nc_semantic_search() tool to use BM25 hybrid only
- Remove legacy keyword.py, fuzzy.py, and custom hybrid.py (736 lines)
- Update ADR-014 with implementation notes and test results

Benefits:
- Consolidated architecture (single Qdrant database)
- Native database-level RRF fusion (more efficient)
- Industry-standard BM25 (replaces brittle custom keyword search)
- Better relevance across semantic and keyword queries
- Simplified codebase (-285 net lines)

Tests: All 125 tests passing (118 unit, 7 integration)

Implements ADR-014: Replace Custom Keyword Search with BM25 Hybrid Search

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
75 lines
2.3 KiB
Python
75 lines
2.3 KiB
Python
"""BM25 sparse embedding provider using FastEmbed."""
|
|
|
|
import logging
|
|
from typing import Any
|
|
|
|
from fastembed import SparseTextEmbedding
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class BM25SparseEmbeddingProvider:
    """
    BM25 sparse embedding provider for hybrid search.

    Uses FastEmbed's BM25 model to generate sparse vectors for keyword-based
    retrieval. These sparse vectors are combined with dense semantic vectors
    in Qdrant using Reciprocal Rank Fusion (RRF) for hybrid search.

    Unlike dense embeddings which have fixed dimensions, sparse embeddings
    have variable-length vectors with (index, value) pairs representing
    term frequencies in the BM25 vocabulary.

    Every embedding yielded by the model is converted to a plain dictionary
    with ``"indices"`` and ``"values"`` lists (via ``.tolist()``), so the
    result contains only built-in Python containers.

    NOTE(review): both ``encode`` and ``encode_batch`` go through the model's
    ``embed`` method. FastEmbed's sparse models also expose ``query_embed``
    for query-side encoding -- confirm whether search-time callers should
    use that instead.
    """

    def __init__(self, model_name: str = "Qdrant/bm25"):
        """
        Initialize BM25 sparse embedding provider.

        Args:
            model_name: FastEmbed BM25 model name (default: Qdrant/bm25)
        """
        self.model_name = model_name
        # Lazy %-style arguments: the message is only formatted when the
        # INFO level is actually enabled.
        logger.info("Initializing BM25 sparse embedding provider: %s", model_name)

        # Initialize FastEmbed sparse embedding model
        self.model = SparseTextEmbedding(model_name=model_name)
        logger.info("BM25 sparse embedding model loaded: %s", model_name)

    @staticmethod
    def _to_sparse_dict(embedding: Any) -> dict[str, Any]:
        """
        Convert one sparse embedding object into a plain dictionary.

        Args:
            embedding: Object exposing ``indices`` and ``values`` attributes
                that each support ``.tolist()``.

        Returns:
            Dictionary with 'indices' and 'values' keys holding Python lists.
        """
        return {
            "indices": embedding.indices.tolist(),
            "values": embedding.values.tolist(),
        }

    def encode(self, text: str) -> dict[str, Any]:
        """
        Generate BM25 sparse embedding for a single text.

        Args:
            text: Input text to encode

        Returns:
            Dictionary with 'indices' and 'values' keys for Qdrant sparse vector
        """
        # The model is given a one-element batch; take the first (only) result.
        sparse_embedding = next(iter(self.model.embed([text])))
        return self._to_sparse_dict(sparse_embedding)

    def encode_batch(self, texts: list[str]) -> list[dict[str, Any]]:
        """
        Generate BM25 sparse embeddings for multiple texts (batched).

        Args:
            texts: List of texts to encode

        Returns:
            List of dictionaries with 'indices' and 'values' for each text,
            in the order the model yields them. An empty input returns an
            empty list without invoking the model.
        """
        if not texts:
            return []

        # Consume the model's output directly instead of materializing an
        # intermediate list first.
        return [self._to_sparse_dict(emb) for emb in self.model.embed(texts)]
|