7b8c3f93a8
Adds comprehensive integration tests for vector database semantic search that work without external dependencies (Ollama), making them suitable for CI/CD. Changes: - Add SimpleEmbeddingProvider: in-process TF-IDF-like embeddings using feature hashing - Make Ollama optional: embedding service now falls back to SimpleEmbeddingProvider - Add 6 integration tests covering semantic search, filtering, and batch operations - Downgrade urllib3 to 1.26.x for qdrant-client compatibility - Update docker-compose.yml to comment out Ollama configuration (optional) The SimpleEmbeddingProvider generates deterministic, normalized embeddings suitable for testing semantic similarity without requiring external services. Tests validate that similar texts have higher cosine similarity and that semantic search correctly ranks results by relevance. Test coverage: - Deterministic embedding generation - Semantic similarity between texts - Full search flow with Qdrant (in-memory) - Category filtering - Empty result handling - Batch embedding generation All tests pass and can run in GitHub CI without Ollama infrastructure. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
124 lines
3.2 KiB
Python
124 lines
3.2 KiB
Python
"""Simple in-process embedding provider for testing.

This provider uses a basic TF-IDF-like approach (log-scaled term frequency, no
corpus statistics) with feature hashing to generate deterministic embeddings
without requiring external services. Suitable for testing but not for
production use.
"""
|
|
|
|
import hashlib
|
|
import math
|
|
import re
|
|
from collections import Counter
|
|
|
|
from .base import EmbeddingProvider
|
|
|
|
|
|
class SimpleEmbeddingProvider(EmbeddingProvider):
    """Simple deterministic embedding provider using feature hashing.

    This implementation:
    - Lowercases the text and tokenizes it into words
    - Uses feature hashing (an MD5-derived index) to map each distinct word
      to one slot of a fixed-size vector; colliding words share a slot
    - Weights each word by ``log(1 + count)``, a sub-linear term frequency.
      No inverse-document-frequency statistics are involved, so the
      weighting is only "TF-IDF-like"
    - Normalizes vectors to unit length

    Not suitable for production but good for testing semantic search
    infrastructure.
    """

    def __init__(self, dimension: int = 384):
        """Initialize simple embedding provider.

        Args:
            dimension: Embedding dimension (default: 384). Must be at least 1.

        Raises:
            ValueError: If ``dimension`` is smaller than 1.
        """
        # Fail fast: a non-positive dimension would otherwise only surface
        # later as a ZeroDivisionError or IndexError while hashing words.
        if dimension < 1:
            raise ValueError(
                f"dimension must be a positive integer, got {dimension!r}"
            )
        self.dimension = dimension

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize text into lowercase words.

        Args:
            text: Input text

        Returns:
            List of lowercase word tokens (maximal runs of ``\\w`` characters),
            in order of appearance; empty if the text contains no such run.
        """
        return re.findall(r"\b\w+\b", text.lower())

    def _hash_word(self, word: str) -> int:
        """Hash word to dimension index.

        Args:
            word: Word to hash

        Returns:
            Index in range [0, dimension)
        """
        # MD5 serves purely as a stable bucket hash here, not as a security
        # primitive; usedforsecurity=False keeps it usable on builds that
        # block insecure digests (e.g. FIPS mode). The digest is unchanged.
        digest = hashlib.md5(word.encode(), usedforsecurity=False).digest()
        # The first 4 bytes, read big-endian, give a 32-bit integer that is
        # folded into the vector size.
        return int.from_bytes(digest[:4], byteorder="big") % self.dimension

    def _embed_single(self, text: str) -> list[float]:
        """Generate embedding for single text.

        Args:
            text: Input text

        Returns:
            Unit-length embedding vector with ``dimension`` entries, or an
            all-zero vector when the text contains no word tokens.
        """
        tokens = self._tokenize(text)
        if not tokens:
            return [0.0] * self.dimension

        vector = [0.0] * self.dimension

        # Accumulate log(1 + count) per distinct word into its hashed slot;
        # "+=" lets words that collide on a slot add up.
        for word, count in Counter(tokens).items():
            vector[self._hash_word(word)] += math.log1p(count)

        # Normalize to unit length; the guard avoids dividing by zero.
        norm = math.sqrt(sum(x * x for x in vector))
        if norm > 0:
            vector = [x / norm for x in vector]

        return vector

    async def embed(self, text: str) -> list[float]:
        """Generate embedding vector for text.

        Args:
            text: Input text to embed

        Returns:
            Vector embedding as list of floats
        """
        return self._embed_single(text)

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts.

        Args:
            texts: List of texts to embed

        Returns:
            List of vector embeddings, one per input text and in input order
        """
        return [self._embed_single(text) for text in texts]

    def get_dimension(self) -> int:
        """Get embedding dimension.

        Returns:
            Vector dimension
        """
        return self.dimension
|