diff --git a/docker-compose.yml b/docker-compose.yml index 066e56c..9b62183 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -85,18 +85,18 @@ services: # Vector sync configuration (ADR-007) - VECTOR_SYNC_ENABLED=true - - VECTOR_SYNC_SCAN_INTERVAL=3600 - - VECTOR_SYNC_PROCESSOR_WORKERS=3 + - VECTOR_SYNC_SCAN_INTERVAL=10 + - VECTOR_SYNC_PROCESSOR_WORKERS=1 # Qdrant configuration - QDRANT_URL=http://qdrant:6333 - QDRANT_API_KEY=${QDRANT_API_KEY:-my_secret_api_key} - QDRANT_COLLECTION=nextcloud_content - # Ollama configuration - - OLLAMA_BASE_URL=https://ollama.internal.coutinho.io:443 - - OLLAMA_EMBEDDING_MODEL=nomic-embed-text - - OLLAMA_VERIFY_SSL=true + # Ollama configuration (optional - uses SimpleEmbeddingProvider if not set) + # - OLLAMA_BASE_URL=http://your-ollama-endpoint:port + # - OLLAMA_EMBEDDING_MODEL=nomic-embed-text + # - OLLAMA_VERIFY_SSL=false mcp-oauth: build: . @@ -211,7 +211,7 @@ services: environment: - QDRANT__SERVICE__API_KEY=${QDRANT_API_KEY:-my_secret_api_key} healthcheck: - test: ["CMD-SHELL", "curl -f http://localhost:6333/readyz || exit 1"] + test: ["CMD-SHELL", "test -f /qdrant/.qdrant-initialized"] interval: 10s timeout: 5s retries: 10 diff --git a/nextcloud_mcp_server/embedding/__init__.py b/nextcloud_mcp_server/embedding/__init__.py index 3b06aba..37fae36 100644 --- a/nextcloud_mcp_server/embedding/__init__.py +++ b/nextcloud_mcp_server/embedding/__init__.py @@ -1,5 +1,6 @@ """Embedding service package for generating vector embeddings.""" from .service import EmbeddingService, get_embedding_service +from .simple_provider import SimpleEmbeddingProvider -__all__ = ["EmbeddingService", "get_embedding_service"] +__all__ = ["EmbeddingService", "get_embedding_service", "SimpleEmbeddingProvider"] diff --git a/nextcloud_mcp_server/embedding/service.py b/nextcloud_mcp_server/embedding/service.py index 758744a..676b349 100644 --- a/nextcloud_mcp_server/embedding/service.py +++ b/nextcloud_mcp_server/embedding/service.py @@ -5,6 +5,7 @@ 
import os from .base import EmbeddingProvider from .ollama_provider import OllamaEmbeddingProvider +from .simple_provider import SimpleEmbeddingProvider logger = logging.getLogger(__name__) @@ -21,27 +22,35 @@ class EmbeddingService: Auto-detect available embedding provider. Checks environment variables in order: - 1. OLLAMA_BASE_URL - Use Ollama provider + 1. OLLAMA_BASE_URL - Use Ollama provider (production) + 2. OPENAI_API_KEY - Use OpenAI provider (future) + 3. Fallback to SimpleEmbeddingProvider (testing/development) Returns: Configured embedding provider - - Raises: - ValueError: If no embedding provider is configured """ - # Ollama provider (for this deployment) + # Ollama provider (production) ollama_url = os.getenv("OLLAMA_BASE_URL") if ollama_url: + logger.info(f"Using Ollama embedding provider: {ollama_url}") return OllamaEmbeddingProvider( base_url=ollama_url, model=os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text"), verify_ssl=os.getenv("OLLAMA_VERIFY_SSL", "true").lower() == "true", ) - raise ValueError( - "No embedding provider configured. " - "Set OLLAMA_BASE_URL environment variable." + # OpenAI provider (future implementation) + # openai_key = os.getenv("OPENAI_API_KEY") + # if openai_key: + # return OpenAIEmbeddingProvider(api_key=openai_key) + + # Fallback to simple provider for development/testing + logger.warning( + "No embedding provider configured (OLLAMA_BASE_URL or OPENAI_API_KEY not set). " + "Using SimpleEmbeddingProvider for testing/development. " + "For production, configure an external embedding service." ) + return SimpleEmbeddingProvider(dimension=384) async def embed(self, text: str) -> list[float]: """ diff --git a/nextcloud_mcp_server/embedding/simple_provider.py b/nextcloud_mcp_server/embedding/simple_provider.py new file mode 100644 index 0000000..6002c7d --- /dev/null +++ b/nextcloud_mcp_server/embedding/simple_provider.py @@ -0,0 +1,123 @@ +"""Simple in-process embedding provider for testing. 
class SimpleEmbeddingProvider(EmbeddingProvider):
    """Deterministic in-process embedding provider based on feature hashing.

    Each text is embedded as follows:

    - the text is lowercased and split into word tokens;
    - every distinct token is hashed (MD5) to one slot of a fixed-size vector;
    - that slot is incremented by ``log(1 + count)``, i.e. a sub-linear term
      frequency weight (there is no inverse-document-frequency term, because
      no corpus statistics are kept);
    - the resulting vector is scaled to unit Euclidean length.

    Tokens that hash to the same slot simply add up. Texts that share words
    end up with a positive dot product, which is enough to exercise search
    infrastructure in tests, but this is not a semantic model and is not
    suitable for production use.
    """

    # Compiled once at class creation instead of on every _tokenize() call.
    _TOKEN_PATTERN = re.compile(r"\b\w+\b")

    def __init__(self, dimension: int = 384):
        """Initialize simple embedding provider.

        Args:
            dimension: Embedding dimension (default: 384)

        Raises:
            ValueError: If ``dimension`` is smaller than 1. Without this check
                a zero dimension only fails later with ZeroDivisionError in
                ``_hash_word`` and a negative one yields empty vectors.
        """
        if dimension < 1:
            raise ValueError(
                f"dimension must be a positive integer, got {dimension!r}"
            )
        self.dimension = dimension

    def _tokenize(self, text: str) -> list[str]:
        """Tokenize text into lowercase words.

        Args:
            text: Input text

        Returns:
            List of lowercase word tokens (maximal runs of ``\\w`` characters),
            in order of appearance; empty if the text contains no word
            characters
        """
        return self._TOKEN_PATTERN.findall(text.lower())

    def _hash_word(self, word: str) -> int:
        """Hash word to dimension index.

        Args:
            word: Word to hash

        Returns:
            Index in range [0, dimension)
        """
        # MD5 is used purely as a stable bucketing function (Python's built-in
        # hash() is salted per process). usedforsecurity=False keeps this
        # working on FIPS-restricted builds where plain md5() raises.
        digest = hashlib.md5(word.encode("utf-8"), usedforsecurity=False).digest()
        # The first 4 bytes (big-endian) are plenty for choosing a bucket.
        return int.from_bytes(digest[:4], byteorder="big") % self.dimension

    def _embed_single(self, text: str) -> list[float]:
        """Generate embedding for single text.

        Args:
            text: Input text

        Returns:
            Unit-length embedding vector, or an all-zero vector when the text
            contains no word tokens
        """
        tokens = self._tokenize(text)
        if not tokens:
            # Nothing to hash; a zero vector cannot be normalized.
            return [0.0] * self.dimension

        vector = [0.0] * self.dimension

        # Sub-linear term-frequency weighting; colliding words accumulate.
        for word, count in Counter(tokens).items():
            vector[self._hash_word(word)] += math.log1p(count)

        # Normalize to unit length. Every contribution above is positive, so
        # the norm is non-zero here; the guard is purely defensive.
        norm = math.sqrt(sum(x * x for x in vector))
        if norm > 0:
            vector = [x / norm for x in vector]

        return vector

    async def embed(self, text: str) -> list[float]:
        """Generate embedding vector for text.

        Args:
            text: Input text to embed

        Returns:
            Vector embedding as list of floats
        """
        return self._embed_single(text)

    async def embed_batch(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for multiple texts.

        Args:
            texts: List of texts to embed

        Returns:
            List of vector embeddings, one per input text and in the same order
        """
        return [self._embed_single(text) for text in texts]

    def get_dimension(self) -> int:
        """Get embedding dimension.

        Returns:
            Vector dimension
        """
        return self.dimension
import math

import pytest
from qdrant_client import AsyncQdrantClient
from qdrant_client.models import (
    Distance,
    FieldCondition,
    Filter,
    MatchValue,
    PointStruct,
    VectorParams,
)

from nextcloud_mcp_server.embedding import SimpleEmbeddingProvider

pytestmark = pytest.mark.integration

# Single source of truth for the vector size, shared by the embedding provider
# fixture and the Qdrant collection so the two cannot drift apart.
EMBEDDING_DIM = 384


@pytest.fixture
async def simple_embedding_provider():
    """Simple in-process embedding provider for testing."""
    return SimpleEmbeddingProvider(dimension=EMBEDDING_DIM)


@pytest.fixture
async def qdrant_test_client():
    """Qdrant client for testing (in-memory)."""
    client = AsyncQdrantClient(":memory:")
    yield client
    await client.close()


@pytest.fixture
async def test_collection(qdrant_test_client: AsyncQdrantClient):
    """Create a cosine-distance test collection and drop it after the test."""
    collection_name = "test_semantic_search"

    await qdrant_test_client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
    )

    yield collection_name

    # Best-effort cleanup: a failure to drop the collection must not turn a
    # passing test into an error, so any exception is deliberately ignored.
    try:
        await qdrant_test_client.delete_collection(collection_name)
    except Exception:
        pass


@pytest.fixture
def sample_notes():
    """Sample notes for testing semantic search."""
    return [
        {
            "id": 1,
            "title": "Python Async Programming",
            "content": """# Python Async/Await Patterns

## Key Concepts
- Use async def for coroutines
- Use await for async operations
- asyncio.gather() for parallel execution

## Best Practices
Always use async context managers for resources.
Avoid blocking operations in async code.""",
            "category": "Development",
        },
        {
            "id": 2,
            "title": "Book Recommendations 2025",
            "content": """# Books to Read

## Fiction
- The Midnight Library by Matt Haig
- Project Hail Mary by Andy Weir

## Non-Fiction
- Atomic Habits by James Clear
- Deep Work by Cal Newport

## Technical
- Designing Data-Intensive Applications by Martin Kleppmann""",
            "category": "Personal",
        },
        {
            "id": 3,
            "title": "Chocolate Chip Cookie Recipe",
            "content": """# Classic Cookies

## Ingredients
- 2 cups flour
- 1 cup butter
- 1 cup sugar
- 2 eggs
- 2 cups chocolate chips

## Instructions
1. Preheat oven to 375°F
2. Mix butter and sugar
3. Add eggs and vanilla
4. Mix in flour
5. Fold in chocolate chips
6. Bake 10-12 minutes""",
            "category": "Recipes",
        },
        {
            "id": 4,
            "title": "Team Meeting Notes",
            "content": """# Q1 Planning Meeting

## Attendees
- Alice, Bob, Charlie

## Discussion
- Review Q4 deliverables
- Plan Q1 sprints
- Resource allocation

## Action Items
- Alice: Draft timeline
- Bob: Infrastructure review""",
            "category": "Work",
        },
    ]


def _assert_unit_length(vector: list[float]) -> None:
    """Assert that ``vector`` has Euclidean norm 1 within an absolute 1e-6."""
    norm = math.sqrt(sum(x * x for x in vector))
    assert math.isclose(norm, 1.0, rel_tol=0.0, abs_tol=1e-6)


async def _index_notes(
    client: AsyncQdrantClient,
    collection_name: str,
    provider: SimpleEmbeddingProvider,
    notes: list[dict],
    *,
    include_excerpt: bool = False,
) -> None:
    """Embed ``notes`` (title + content) and upsert them into the collection.

    Args:
        client: Qdrant client to write to
        collection_name: Target collection
        provider: Embedding provider used to vectorize each note
        notes: Note dicts with ``id``, ``title``, ``content`` and ``category``
        include_excerpt: Also store the first 200 characters of the embedded
            text under the ``excerpt`` payload key
    """
    points = []
    for note in notes:
        content = f"{note['title']}\n\n{note['content']}"
        payload = {
            "note_id": note["id"],
            "title": note["title"],
            "category": note["category"],
        }
        if include_excerpt:
            payload["excerpt"] = content[:200]

        points.append(
            PointStruct(
                id=note["id"],  # Use integer ID for in-memory Qdrant
                vector=await provider.embed(content),
                payload=payload,
            )
        )

    await client.upsert(collection_name=collection_name, points=points, wait=True)


async def _search(
    client: AsyncQdrantClient,
    collection_name: str,
    provider: SimpleEmbeddingProvider,
    query: str,
    **search_kwargs,
):
    """Embed ``query`` and run a vector search, forwarding extra search options.

    Args:
        client: Qdrant client to query
        collection_name: Collection to search
        provider: Embedding provider used to vectorize the query
        query: Free-text query
        **search_kwargs: Passed through unchanged to ``client.search`` (for
            example ``limit``, ``score_threshold`` or ``query_filter``)

    Returns:
        Whatever ``client.search`` returns for the embedded query
    """
    query_vector = await provider.embed(query)
    return await client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        **search_kwargs,
    )


async def test_simple_embedding_provider_deterministic(simple_embedding_provider):
    """Test that SimpleEmbeddingProvider generates deterministic embeddings."""
    text = "Hello world this is a test"

    # Embedding the same text twice must give the exact same vector.
    embedding1 = await simple_embedding_provider.embed(text)
    embedding2 = await simple_embedding_provider.embed(text)

    assert embedding1 == embedding2
    assert len(embedding1) == EMBEDDING_DIM
    _assert_unit_length(embedding1)


async def test_simple_embedding_provider_similarity(simple_embedding_provider):
    """Test that similar texts have higher cosine similarity."""

    async def cosine_similarity(text1: str, text2: str) -> float:
        # Vectors are unit length, so the dot product is the cosine similarity.
        emb1 = await simple_embedding_provider.embed(text1)
        emb2 = await simple_embedding_provider.embed(text2)
        return sum(a * b for a, b in zip(emb1, emb2))

    python_text1 = "Python async programming with asyncio"
    python_text2 = "Using async and await in Python"
    unrelated_text = "Chocolate chip cookie recipe"

    similar_score = await cosine_similarity(python_text1, python_text2)
    unrelated_score = await cosine_similarity(python_text1, unrelated_text)

    assert similar_score > unrelated_score
    assert similar_score > 0.3  # Some semantic overlap


async def test_semantic_search_with_qdrant(
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
    sample_notes: list[dict],
):
    """Test full semantic search flow with Qdrant."""
    await _index_notes(
        qdrant_test_client,
        test_collection,
        simple_embedding_provider,
        sample_notes,
        include_excerpt=True,
    )

    # Query 1: the Python note must be the top result.
    results = await _search(
        qdrant_test_client,
        test_collection,
        simple_embedding_provider,
        "async programming patterns in Python",
        limit=3,
        score_threshold=0.0,
    )
    assert len(results) > 0
    assert results[0].payload["note_id"] == 1
    assert "Python" in results[0].payload["title"]

    # Query 2: the book recommendations note must be the top result.
    results = await _search(
        qdrant_test_client,
        test_collection,
        simple_embedding_provider,
        "good books to read recommendations",
        limit=3,
        score_threshold=0.0,
    )
    assert len(results) > 0
    top_result = results[0]
    assert top_result.payload["note_id"] == 2
    assert "Book" in top_result.payload["title"]

    # Query 3: the recipe note only has to appear in the top 2 results.
    results = await _search(
        qdrant_test_client,
        test_collection,
        simple_embedding_provider,
        "how to bake cookies dessert",
        limit=3,
        score_threshold=0.0,
    )
    assert len(results) > 0
    top_note_ids = [r.payload["note_id"] for r in results[:2]]
    assert 3 in top_note_ids


async def test_semantic_search_with_filters(
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
    sample_notes: list[dict],
):
    """Test semantic search with category filtering."""
    await _index_notes(
        qdrant_test_client,
        test_collection,
        simple_embedding_provider,
        sample_notes,
    )

    # Search only in "Personal" category
    results = await _search(
        qdrant_test_client,
        test_collection,
        simple_embedding_provider,
        "books reading",
        query_filter=Filter(
            must=[FieldCondition(key="category", match=MatchValue(value="Personal"))]
        ),
        limit=3,
    )

    # Should only return Personal category notes
    assert len(results) > 0
    for result in results:
        assert result.payload["category"] == "Personal"


async def test_semantic_search_empty_results(
    qdrant_test_client: AsyncQdrantClient,
    test_collection: str,
    simple_embedding_provider: SimpleEmbeddingProvider,
):
    """Test semantic search with no indexed content returns empty results."""
    results = await _search(
        qdrant_test_client,
        test_collection,
        simple_embedding_provider,
        "test query",
        limit=10,
    )

    assert len(results) == 0


async def test_batch_embedding(simple_embedding_provider: SimpleEmbeddingProvider):
    """Test batch embedding generation."""
    texts = [
        "First document about Python",
        "Second document about JavaScript",
        "Third document about TypeScript",
    ]

    embeddings = await simple_embedding_provider.embed_batch(texts)

    assert len(embeddings) == 3
    assert all(len(emb) == EMBEDDING_DIM for emb in embeddings)

    # Each embedding should be normalized.
    for emb in embeddings:
        _assert_unit_length(emb)
"https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" } +sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" }, + { url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" }, ] [[package]]