feat: implement vector sync scanner and processor (ADR-007 Phase 2)

Implements background vector database synchronization using anyio TaskGroups for BasicAuth mode with single-user credentials. Scanner Implementation: - Periodic document discovery (hourly, configurable) - Timestamp-based change detection (Nextcloud vs Qdrant) - Wake event for immediate scanning on-demand - Supports both initial sync (all docs) and incremental sync (changes only) - Detects deleted documents and queues for removal Processor Implementation: - Concurrent document processing pool (3 workers default) - I/O-bound embedding generation via Ollama API - Retry logic with exponential backoff (3 retries) - Document chunking (512 words, 50-word overlap) - Handles both index and delete operations - Upserts vectors to Qdrant with rich metadata App Lifespan Integration: - Extended AppContext with background task state - Modified app_lifespan_basic() to start tasks via anyio TaskGroups - Graceful shutdown with coordinated task cancellation - Only activates when VECTOR_SYNC_ENABLED=true Embedding Service: - OllamaEmbeddingProvider with TLS support - Singleton pattern for shared client instances - Batch embedding support for efficiency - Auto-detects embedding dimension (768 for nomic-embed-text) Qdrant Client: - Async client wrapper with singleton pattern - Auto-creates collection on first use - COSINE distance metric for semantic similarity - Integrates with embedding service for dimension detection Health Check Enhancement: - Added Qdrant status check to /health/ready endpoint - Only checks when VECTOR_SYNC_ENABLED=true - 2-second timeout for health probe - Reports connection errors with details Configuration: - VECTOR_SYNC_ENABLED: Enable background sync - VECTOR_SYNC_SCAN_INTERVAL: Scanner frequency (3600s default) - VECTOR_SYNC_PROCESSOR_WORKERS: Concurrent processors (3 default) - QDRANT_URL, QDRANT_API_KEY, QDRANT_COLLECTION: Vector DB config - OLLAMA_BASE_URL, OLLAMA_EMBEDDING_MODEL: Embedding service config Dependencies Added: - qdrant-client>=1.7.0: Vector database client Docker Compose: - Added Qdrant service with health check - Exposed ports 6333 (REST) and 6334 (gRPC) - Configured MCP service with vector sync environment - Added qdrant-data volume for persistence Known Issue: - FastMCP lifespan not triggering for streamable-http transport - Background tasks will start once lifespan integration is complete - Lifespan triggers on MCP session establishment, not server startup Related: ADR-007 Background Vector Database Synchronization 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 21:14:38 +01:00
parent dc93da2ea0
commit 8f45e996e8
16 changed files with 1122 additions and 11 deletions
@@ -0,0 +1,16 @@
+"""Vector database and background sync package."""
+
+from .document_chunker import DocumentChunker
+from .processor import process_document, processor_task
+from .qdrant_client import get_qdrant_client
+from .scanner import DocumentTask, scan_user_documents, scanner_task
+
+__all__ = [
+    "get_qdrant_client",
+    "DocumentChunker",
+    "scanner_task",
+    "scan_user_documents",
+    "DocumentTask",
+    "processor_task",
+    "process_document",
+]
@@ -0,0 +1,51 @@
+"""Document chunking for large texts."""
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentChunker:
+    """Chunk large documents for optimal embedding."""
+
+    def __init__(self, chunk_size: int = 512, overlap: int = 50):
+        """
+        Initialize document chunker.
+
+        Args:
+            chunk_size: Number of words per chunk (default: 512)
+            overlap: Number of overlapping words between chunks (default: 50)
+        """
+        self.chunk_size = chunk_size
+        self.overlap = overlap
+
+    def chunk_text(self, content: str) -> list[str]:
+        """
+        Split text into overlapping chunks.
+
+        Uses simple word-based chunking with configurable overlap to preserve
+        context across chunk boundaries.
+
+        Args:
+            content: Text content to chunk
+
+        Returns:
+            List of text chunks (may be single item if content is small)
+        """
+        # Simple word-based chunking
+        words = content.split()
+
+        if len(words) <= self.chunk_size:
+            return [content]
+
+        chunks = []
+        start = 0
+
+        while start < len(words):
+            end = start + self.chunk_size
+            chunk_words = words[start:end]
+            chunks.append(" ".join(chunk_words))
+            start = end - self.overlap
+
+        logger.debug(f"Chunked document into {len(chunks)} chunks ({len(words)} words)")
+        return chunks
@@ -0,0 +1,219 @@
+"""Processor task for vector database synchronization.
+
+Processes documents from queue: fetches content, generates embeddings, stores in Qdrant.
+"""
+
+import asyncio
+import logging
+import time
+
+import anyio
+from httpx import HTTPStatusError
+from qdrant_client.models import FieldCondition, Filter, MatchValue, PointStruct
+
+from nextcloud_mcp_server.client import NextcloudClient
+from nextcloud_mcp_server.config import get_settings
+from nextcloud_mcp_server.embedding import get_embedding_service
+from nextcloud_mcp_server.vector.document_chunker import DocumentChunker
+from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
+from nextcloud_mcp_server.vector.scanner import DocumentTask
+
+logger = logging.getLogger(__name__)
+
+
+async def processor_task(
+    worker_id: int,
+    document_queue: asyncio.Queue,
+    shutdown_event: anyio.Event,
+    nc_client: NextcloudClient,
+    user_id: str,
+):
+    """
+    Process documents from queue concurrently.
+
+    Each processor task runs in a loop:
+    1. Pull document from queue (with timeout)
+    2. Fetch content from Nextcloud
+    3. Tokenize and chunk text
+    4. Generate embeddings (I/O bound - external API)
+    5. Upload vectors to Qdrant
+    6. Mark task complete
+
+    Multiple processors run concurrently for I/O parallelism.
+
+    Args:
+        worker_id: Worker identifier for logging
+        document_queue: Queue to pull documents from
+        shutdown_event: Event signaling shutdown
+        nc_client: Authenticated Nextcloud client
+        user_id: User being processed
+    """
+    logger.info(f"Processor {worker_id} started")
+
+    while not shutdown_event.is_set():
+        try:
+            # Get document with timeout (allows checking shutdown)
+            doc_task = await asyncio.wait_for(
+                document_queue.get(),
+                timeout=1.0,
+            )
+
+            # Process document
+            await process_document(doc_task, nc_client)
+
+            # Mark complete
+            document_queue.task_done()
+
+        except asyncio.TimeoutError:
+            # No documents available, continue
+            continue
+
+        except Exception as e:
+            logger.error(
+                f"Processor {worker_id} error processing "
+                f"{doc_task.doc_type}_{doc_task.doc_id}: {e}",
+                exc_info=True,
+            )
+            # Mark task done even on error to prevent queue blocking
+            try:
+                document_queue.task_done()
+            except ValueError:
+                pass
+
+    logger.info(f"Processor {worker_id} stopped")
+
+
+async def process_document(doc_task: DocumentTask, nc_client: NextcloudClient):
+    """
+    Process a single document: fetch, tokenize, embed, store in Qdrant.
+
+    Implements retry logic with exponential backoff for transient failures.
+
+    Args:
+        doc_task: Document task to process
+        nc_client: Authenticated Nextcloud client
+    """
+    logger.debug(
+        f"Processing {doc_task.doc_type}_{doc_task.doc_id} "
+        f"for {doc_task.user_id} ({doc_task.operation})"
+    )
+
+    qdrant_client = await get_qdrant_client()
+    settings = get_settings()
+
+    # Handle deletion
+    if doc_task.operation == "delete":
+        await qdrant_client.delete(
+            collection_name=settings.qdrant_collection,
+            points_selector=Filter(
+                must=[
+                    FieldCondition(
+                        key="user_id",
+                        match=MatchValue(value=doc_task.user_id),
+                    ),
+                    FieldCondition(
+                        key="doc_id",
+                        match=MatchValue(value=doc_task.doc_id),
+                    ),
+                    FieldCondition(
+                        key="doc_type",
+                        match=MatchValue(value=doc_task.doc_type),
+                    ),
+                ]
+            ),
+        )
+        logger.info(
+            f"Deleted {doc_task.doc_type}_{doc_task.doc_id} for {doc_task.user_id}"
+        )
+        return
+
+    # Handle indexing with retry
+    max_retries = 3
+    retry_delay = 1.0
+
+    for attempt in range(max_retries):
+        try:
+            await _index_document(doc_task, nc_client, qdrant_client)
+            return  # Success
+
+        except (HTTPStatusError, Exception) as e:
+            if attempt < max_retries - 1:
+                logger.warning(
+                    f"Retry {attempt + 1}/{max_retries} for "
+                    f"{doc_task.doc_type}_{doc_task.doc_id}: {e}"
+                )
+                await anyio.sleep(retry_delay)
+                retry_delay *= 2  # Exponential backoff
+            else:
+                logger.error(
+                    f"Failed to index {doc_task.doc_type}_{doc_task.doc_id} "
+                    f"after {max_retries} retries: {e}"
+                )
+                raise
+
+
+async def _index_document(
+    doc_task: DocumentTask, nc_client: NextcloudClient, qdrant_client
+):
+    """
+    Index a single document (called by process_document with retry).
+
+    Args:
+        doc_task: Document task to index
+        nc_client: Authenticated Nextcloud client
+        qdrant_client: Qdrant client instance
+    """
+    settings = get_settings()
+
+    # Fetch document content
+    if doc_task.doc_type == "note":
+        document = await nc_client.notes.get_note(int(doc_task.doc_id))
+        content = f"{document['title']}\n\n{document['content']}"
+        title = document["title"]
+        etag = document.get("etag", "")
+    else:
+        raise ValueError(f"Unsupported doc_type: {doc_task.doc_type}")
+
+    # Tokenize and chunk
+    chunker = DocumentChunker(chunk_size=512, overlap=50)
+    chunks = chunker.chunk_text(content)
+
+    # Generate embeddings (I/O bound - external API call)
+    embedding_service = get_embedding_service()
+    embeddings = await embedding_service.embed_batch(chunks)
+
+    # Prepare Qdrant points
+    indexed_at = int(time.time())
+    points = []
+
+    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
+        points.append(
+            PointStruct(
+                id=f"{doc_task.doc_type}_{doc_task.doc_id}_{i}",
+                vector=embedding,
+                payload={
+                    "user_id": doc_task.user_id,
+                    "doc_id": doc_task.doc_id,
+                    "doc_type": doc_task.doc_type,
+                    "title": title,
+                    "excerpt": chunk[:200],
+                    "indexed_at": indexed_at,
+                    "modified_at": doc_task.modified_at,
+                    "etag": etag,
+                    "chunk_index": i,
+                    "total_chunks": len(chunks),
+                },
+            )
+        )
+
+    # Upsert to Qdrant
+    await qdrant_client.upsert(
+        collection_name=settings.qdrant_collection,
+        points=points,
+        wait=True,
+    )
+
+    logger.info(
+        f"Indexed {doc_task.doc_type}_{doc_task.doc_id} for {doc_task.user_id} "
+        f"({len(chunks)} chunks)"
+    )
@@ -0,0 +1,66 @@
+"""Qdrant client wrapper."""
+
+import logging
+import os
+
+from qdrant_client import AsyncQdrantClient
+from qdrant_client.models import Distance, VectorParams
+
+logger = logging.getLogger(__name__)
+
+
+# Singleton instance
+_qdrant_client: AsyncQdrantClient | None = None
+
+
+async def get_qdrant_client() -> AsyncQdrantClient:
+    """
+    Get singleton Qdrant client instance.
+
+    Automatically creates collection on first use if it doesn't exist.
+
+    Returns:
+        Configured AsyncQdrantClient instance
+
+    Raises:
+        Exception: If Qdrant connection fails or collection creation fails
+    """
+    global _qdrant_client
+
+    if _qdrant_client is None:
+        url = os.getenv("QDRANT_URL", "http://qdrant:6333")
+        api_key = os.getenv("QDRANT_API_KEY")
+
+        _qdrant_client = AsyncQdrantClient(
+            url=url,
+            api_key=api_key,
+            timeout=30,
+        )
+
+        # Ensure collection exists
+        collection_name = os.getenv("QDRANT_COLLECTION", "nextcloud_content")
+
+        # Import here to avoid circular dependency
+        from nextcloud_mcp_server.embedding import get_embedding_service
+
+        embedding_service = get_embedding_service()
+        dimension = embedding_service.get_dimension()
+
+        try:
+            await _qdrant_client.get_collection(collection_name)
+            logger.info(f"Using existing Qdrant collection: {collection_name}")
+        except Exception:
+            # Collection doesn't exist, create it
+            await _qdrant_client.create_collection(
+                collection_name=collection_name,
+                vectors_config=VectorParams(
+                    size=dimension,
+                    distance=Distance.COSINE,
+                ),
+            )
+            logger.info(
+                f"Created Qdrant collection: {collection_name} "
+                f"(dimension={dimension}, distance=COSINE)"
+            )
+
+    return _qdrant_client
@@ -0,0 +1,172 @@
+"""Scanner task for vector database synchronization.
+
+Periodically scans enabled users' content and queues changed documents for processing.
+"""
+
+import asyncio
+import logging
+from dataclasses import dataclass
+
+import anyio
+from qdrant_client.models import FieldCondition, Filter, MatchValue
+
+from nextcloud_mcp_server.client import NextcloudClient
+from nextcloud_mcp_server.config import get_settings
+from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class DocumentTask:
+    """Document task for processing queue."""
+
+    user_id: str
+    doc_id: str
+    doc_type: str  # "note", "file", "calendar"
+    operation: str  # "index" or "delete"
+    modified_at: int
+
+
+async def scanner_task(
+    document_queue: asyncio.Queue,
+    shutdown_event: anyio.Event,
+    wake_event: anyio.Event,
+    nc_client: NextcloudClient,
+    user_id: str,
+):
+    """
+    Periodic scanner that detects changed documents for enabled user.
+
+    For BasicAuth mode, scans a single user with credentials available at runtime.
+
+    Args:
+        document_queue: Queue to enqueue changed documents
+        shutdown_event: Event signaling shutdown
+        wake_event: Event to trigger immediate scan
+        nc_client: Authenticated Nextcloud client
+        user_id: User to scan
+    """
+    logger.info(f"Scanner task started for user: {user_id}")
+    settings = get_settings()
+
+    while not shutdown_event.is_set():
+        try:
+            # Scan user documents
+            await scan_user_documents(
+                user_id=user_id,
+                document_queue=document_queue,
+                nc_client=nc_client,
+            )
+
+        except Exception as e:
+            logger.error(f"Scanner error: {e}", exc_info=True)
+
+        # Sleep until next interval or wake event
+        try:
+            with anyio.move_on_after(settings.vector_sync_scan_interval):
+                # Wait for wake event or shutdown (whichever comes first)
+                await wake_event.wait()
+        except anyio.get_cancelled_exc_class():
+            # Shutdown, exit loop
+            break
+
+    logger.info("Scanner task stopped")
+
+
+async def scan_user_documents(
+    user_id: str,
+    document_queue: asyncio.Queue,
+    nc_client: NextcloudClient,
+    initial_sync: bool = False,
+):
+    """
+    Scan a single user's documents and queue changes.
+
+    Args:
+        user_id: User to scan
+        document_queue: Queue to enqueue changed documents
+        nc_client: Authenticated Nextcloud client
+        initial_sync: If True, queue all documents (first-time sync)
+    """
+    logger.info(f"Scanning documents for user: {user_id}")
+
+    # Fetch all notes from Nextcloud
+    notes = await nc_client.notes.list_notes()
+    logger.debug(f"Found {len(notes)} notes for {user_id}")
+
+    if initial_sync:
+        # Queue everything on first sync
+        for note in notes:
+            await document_queue.put(
+                DocumentTask(
+                    user_id=user_id,
+                    doc_id=str(note["id"]),
+                    doc_type="note",
+                    operation="index",
+                    modified_at=note["modified"],
+                )
+            )
+        logger.info(f"Queued {len(notes)} documents for initial sync: {user_id}")
+        return
+
+    # Get indexed state from Qdrant
+    qdrant_client = await get_qdrant_client()
+    scroll_result = await qdrant_client.scroll(
+        collection_name=get_settings().qdrant_collection,
+        scroll_filter=Filter(
+            must=[
+                FieldCondition(key="user_id", match=MatchValue(value=user_id)),
+                FieldCondition(key="doc_type", match=MatchValue(value="note")),
+            ]
+        ),
+        with_payload=["doc_id", "indexed_at"],
+        with_vectors=False,
+        limit=10000,
+    )
+
+    indexed_docs = {
+        point.payload["doc_id"]: point.payload["indexed_at"]
+        for point in scroll_result[0]
+    }
+
+    logger.debug(f"Found {len(indexed_docs)} indexed documents in Qdrant")
+
+    # Compare and queue changes
+    queued = 0
+    for note in notes:
+        doc_id = str(note["id"])
+        indexed_at = indexed_docs.get(doc_id)
+
+        # Queue if never indexed or modified since last index
+        if indexed_at is None or note["modified"] > indexed_at:
+            await document_queue.put(
+                DocumentTask(
+                    user_id=user_id,
+                    doc_id=doc_id,
+                    doc_type="note",
+                    operation="index",
+                    modified_at=note["modified"],
+                )
+            )
+            queued += 1
+
+    # Check for deleted documents (in Qdrant but not in Nextcloud)
+    nextcloud_doc_ids = {str(note["id"]) for note in notes}
+    for doc_id in indexed_docs:
+        if doc_id not in nextcloud_doc_ids:
+            await document_queue.put(
+                DocumentTask(
+                    user_id=user_id,
+                    doc_id=doc_id,
+                    doc_type="note",
+                    operation="delete",
+                    modified_at=0,
+                )
+            )
+            queued += 1
+
+    if queued > 0:
+        logger.info(f"Queued {queued} documents for incremental sync: {user_id}")
+    else:
+        logger.debug(f"No changes detected for {user_id}")