feat: Add multi-document Protocol with cross-app search support

Implements NextcloudClientProtocol for multi-document type search following user requirement that document types are not 1:1 with apps (e.g., Notes app specializes in markdown, while Files/WebDAV handles multiple file types). Key Changes: - NextcloudClientProtocol: Generic protocol with app-specific client properties - get_indexed_doc_types(): Query Qdrant for actually-indexed document types - Document dispatch: All algorithms check Qdrant before attempting access - Cross-type deduplication: Use (doc_id, doc_type) tuples in hybrid RRF Search Algorithm Updates: - Semantic: Added _verify_document_access() with dispatch to appropriate client - Deduplication by (doc_id, doc_type) tuple - Only "note" verification implemented, others return None with info log - Keyword: Added _fetch_documents() dispatch method - Queries Qdrant for available types before fetching - Supports cross-app search when doc_type=None - Fuzzy: Same pattern as keyword search - Hybrid: Already uses (doc_id, doc_type) for deduplication (no changes needed) Future-Proof Design: - File/calendar verification stubs in place - Clear logging when unsupported types found - Easy to extend when processor indexes new document types Currently Supported: - "note" documents fully implemented and tested - Other types gracefully handled (logged but skipped) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-15 01:19:29 +01:00
parent f3bdb8b885
commit b5b03bfd78
6 changed files with 360 additions and 100 deletions
@@ -10,15 +10,22 @@ All algorithms share the same interface and can be used interchangeably by both
 MCP tools and the visualization pane.
 """

-from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
+from nextcloud_mcp_server.search.algorithms import (
+    NextcloudClientProtocol,
+    SearchAlgorithm,
+    SearchResult,
+    get_indexed_doc_types,
+)
 from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm
 from nextcloud_mcp_server.search.hybrid import HybridSearchAlgorithm
 from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm
 from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm

 __all__ = [
+    "NextcloudClientProtocol",
    "SearchAlgorithm",
    "SearchResult",
+    "get_indexed_doc_types",
    "SemanticSearchAlgorithm",
    "KeywordSearchAlgorithm",
    "FuzzySearchAlgorithm",
@@ -2,7 +2,120 @@

 from abc import ABC, abstractmethod
 from dataclasses import dataclass
-from typing import Any
+from typing import Any, Protocol, runtime_checkable
+
+
+@runtime_checkable
+class NextcloudClientProtocol(Protocol):
+    """Protocol for Nextcloud client supporting multi-document search.
+
+    This protocol defines the interface that search algorithms need from a
+    Nextcloud client to access documents across different apps (Notes, Files,
+    Calendar, etc.). The client provides access to app-specific sub-clients
+    that handle the actual API calls.
+
+    Document types (e.g., "note", "file", "calendar") are NOT 1:1 with apps.
+    For example, the Notes app specializes in markdown files, while Files/WebDAV
+    handles multiple file types. The abstraction is at the document type level.
+
+    Search algorithms query Qdrant to determine which document types are actually
+    indexed before attempting to access them, enabling graceful cross-app search.
+    """
+
+    username: str
+
+    # App-specific clients that search algorithms dispatch to
+    @property
+    def notes(self) -> Any:
+        """Notes client for accessing note documents."""
+        ...
+
+    @property
+    def webdav(self) -> Any:
+        """WebDAV client for accessing file documents."""
+        ...
+
+    @property
+    def calendar(self) -> Any:
+        """Calendar client for accessing event/task documents."""
+        ...
+
+    @property
+    def contacts(self) -> Any:
+        """Contacts client for accessing contact card documents."""
+        ...
+
+    @property
+    def deck(self) -> Any:
+        """Deck client for accessing deck card documents."""
+        ...
+
+    @property
+    def cookbook(self) -> Any:
+        """Cookbook client for accessing recipe documents."""
+        ...
+
+    @property
+    def tables(self) -> Any:
+        """Tables client for accessing table row documents."""
+        ...
+
+
+async def get_indexed_doc_types(user_id: str) -> set[str]:
+    """Query Qdrant to get actually-indexed document types for a user.
+
+    This enables search algorithms to check which document types are available
+    before attempting to search/verify them, allowing graceful cross-app search.
+
+    Args:
+        user_id: User ID to filter by
+
+    Returns:
+        Set of document type strings (e.g., {"note", "file", "calendar"})
+
+    Example:
+        >>> types = await get_indexed_doc_types("alice")
+        >>> if "note" in types:
+        ...     # Search notes
+    """
+    import logging
+
+    from qdrant_client.models import FieldCondition, Filter, MatchValue
+
+    from nextcloud_mcp_server.config import get_settings
+    from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
+
+    logger = logging.getLogger(__name__)
+    settings = get_settings()
+
+    qdrant_client = await get_qdrant_client()
+    collection = settings.qdrant_collection
+
+    # Use scroll to sample documents and extract doc_types
+    # Note: This could be optimized with a facet/aggregation query if Qdrant adds support
+    try:
+        scroll_results, _next_offset = await qdrant_client.scroll(
+            collection_name=collection,
+            scroll_filter=Filter(
+                must=[FieldCondition(key="user_id", match=MatchValue(value=user_id))]
+            ),
+            limit=1000,  # Sample size to discover types
+            with_payload=["doc_type"],
+            with_vectors=False,  # Don't need vectors for type discovery
+        )
+
+        doc_types = {
+            point.payload.get("doc_type")
+            for point in scroll_results
+            if point.payload.get("doc_type")
+        }
+
+        logger.debug(f"Found indexed document types for user {user_id}: {doc_types}")
+        return doc_types
+
+    except Exception as e:
+        logger.warning(f"Failed to query Qdrant for doc_types: {e}")
+        return set()


@dataclass
@@ -3,8 +3,12 @@
 import logging
 from typing import Any

-from nextcloud_mcp_server.client import NextcloudClient
-from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
+from nextcloud_mcp_server.search.algorithms import (
+    NextcloudClientProtocol,
+    SearchAlgorithm,
+    SearchResult,
+    get_indexed_doc_types,
+)

 logger = logging.getLogger(__name__)

@@ -38,7 +42,7 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
        user_id: str,
        limit: int = 10,
        doc_type: str | None = None,
-        nextcloud_client: NextcloudClient | None = None,
+        nextcloud_client: NextcloudClientProtocol | None = None,
        **kwargs: Any,
    ) -> list[SearchResult]:
        """Execute fuzzy search using character overlap.
@@ -67,22 +71,39 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
            f"limit={limit}, threshold={threshold}, doc_type={doc_type}"
        )

-        # Currently only supports notes
-        if doc_type and doc_type != "note":
-            logger.warning(f"Fuzzy search not yet implemented for doc_type={doc_type}")
-            return []
+        # Get available document types from Qdrant
+        indexed_types = await get_indexed_doc_types(user_id)
+        logger.debug(f"Indexed document types for user: {indexed_types}")

-        # Fetch all notes for the user
-        notes = await nextcloud_client.notes.get_notes()
-        logger.debug(f"Fetched {len(notes)} notes for fuzzy search")
+        # Determine which types to search
+        if doc_type:
+            # Search specific type if requested
+            search_types = [doc_type] if doc_type in indexed_types else []
+            if not search_types:
+                logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}")
+                return []
+        else:
+            # Search all indexed types
+            search_types = list(indexed_types)

-        # Score and filter notes
-        scored_notes = []
+        # Fetch documents for each type and score them
+        all_documents = []
+        for dtype in search_types:
+            documents = await self._fetch_documents(nextcloud_client, dtype)
+            for doc in documents:
+                doc["_doc_type"] = dtype  # Tag with type
+            all_documents.extend(documents)
+
+        logger.debug(f"Fetched {len(all_documents)} total documents for fuzzy search")
+
+        # Score and filter documents
+        scored_results = []
        query_lower = query.lower()

-        for note in notes:
-            title = note.get("title", "")
-            content = note.get("content", "")
+        for doc in all_documents:
+            dtype = doc.get("_doc_type", "note")
+            title = doc.get("title", "")
+            content = doc.get("content", "")

            # Check title match
            title_score = self._calculate_char_overlap(query_lower, title.lower())
@@ -100,16 +121,16 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
                else:
                    excerpt = self._extract_excerpt(content, max_length=200)

-                scored_notes.append(
+                scored_results.append(
                    SearchResult(
-                        id=note["id"],
-                        doc_type="note",
+                        id=doc["id"],
+                        doc_type=dtype,
                        title=title or "Untitled",
                        excerpt=excerpt,
                        score=best_score,
                        metadata={
-                            "category": note.get("category", ""),
-                            "modified": note.get("modified"),
+                            "category": doc.get("category", ""),
+                            "modified": doc.get("modified"),
                            "match_location": "title"
                            if title_score >= content_score
                            else "content",
@@ -118,8 +139,8 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
                )

        # Sort by score (descending) and limit
-        scored_notes.sort(key=lambda x: x.score, reverse=True)
-        results = scored_notes[:limit]
+        scored_results.sort(key=lambda x: x.score, reverse=True)
+        results = scored_results[:limit]

        logger.info(f"Fuzzy search returned {len(results)} matching notes")
        if results:
@@ -131,6 +152,32 @@ class FuzzySearchAlgorithm(SearchAlgorithm):

        return results

+    async def _fetch_documents(
+        self, nextcloud_client: NextcloudClientProtocol, doc_type: str
+    ) -> list[dict[str, Any]]:
+        """Fetch documents of a specific type from Nextcloud.
+
+        Args:
+            nextcloud_client: Client for API access
+            doc_type: Document type to fetch ("note", "file", "calendar", etc.)
+
+        Returns:
+            List of document dictionaries with at minimum: id, title, content
+        """
+        if doc_type == "note":
+            return await nextcloud_client.notes.get_notes()
+        elif doc_type == "file":
+            # Future: fetch files when indexed
+            logger.info("File documents not yet supported for fuzzy search")
+            return []
+        elif doc_type == "calendar":
+            # Future: fetch calendar events when indexed
+            logger.info("Calendar documents not yet supported for fuzzy search")
+            return []
+        else:
+            logger.warning(f"Unknown document type '{doc_type}' for fuzzy search")
+            return []
+
    def _calculate_char_overlap(self, query: str, text: str) -> float:
        """Calculate character overlap ratio between query and text.

@@ -5,8 +5,11 @@ import logging
 from collections import defaultdict
 from typing import Any

-from nextcloud_mcp_server.client import NextcloudClient
-from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
+from nextcloud_mcp_server.search.algorithms import (
+    NextcloudClientProtocol,
+    SearchAlgorithm,
+    SearchResult,
+)
 from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm
 from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm
 from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm
@@ -82,7 +85,7 @@ class HybridSearchAlgorithm(SearchAlgorithm):
        user_id: str,
        limit: int = 10,
        doc_type: str | None = None,
-        nextcloud_client: NextcloudClient | None = None,
+        nextcloud_client: NextcloudClientProtocol | None = None,
        **kwargs: Any,
    ) -> list[SearchResult]:
        """Execute hybrid search using RRF to combine algorithms.
@@ -3,8 +3,12 @@
 import logging
 from typing import Any

-from nextcloud_mcp_server.client import NextcloudClient
-from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
+from nextcloud_mcp_server.search.algorithms import (
+    NextcloudClientProtocol,
+    SearchAlgorithm,
+    SearchResult,
+    get_indexed_doc_types,
+)

 logger = logging.getLogger(__name__)

@@ -32,7 +36,7 @@ class KeywordSearchAlgorithm(SearchAlgorithm):
        user_id: str,
        limit: int = 10,
        doc_type: str | None = None,
-        nextcloud_client: NextcloudClient | None = None,
+        nextcloud_client: NextcloudClientProtocol | None = None,
        **kwargs: Any,
    ) -> list[SearchResult]:
        """Execute keyword search using token matching.
@@ -63,52 +67,66 @@ class KeywordSearchAlgorithm(SearchAlgorithm):
        query_tokens = self._process_query(query)
        logger.debug(f"Query tokens: {query_tokens}")

-        # Currently only supports notes
-        # TODO: Extend to other document types (files, calendar, etc.)
-        if doc_type and doc_type != "note":
-            logger.warning(
-                f"Keyword search not yet implemented for doc_type={doc_type}"
-            )
-            return []
+        # Get available document types from Qdrant
+        indexed_types = await get_indexed_doc_types(user_id)
+        logger.debug(f"Indexed document types for user: {indexed_types}")

-        # Fetch all notes for the user
-        notes = await nextcloud_client.notes.get_notes()
-        logger.debug(f"Fetched {len(notes)} notes for keyword search")
+        # Determine which types to search
+        if doc_type:
+            # Search specific type if requested
+            search_types = [doc_type] if doc_type in indexed_types else []
+            if not search_types:
+                logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}")
+                return []
+        else:
+            # Search all indexed types
+            search_types = list(indexed_types)

-        # Score and filter notes
-        scored_notes = []
-        for note in notes:
+        # Fetch documents for each type and score them
+        all_documents = []
+        for dtype in search_types:
+            documents = await self._fetch_documents(nextcloud_client, dtype)
+            for doc in documents:
+                doc["_doc_type"] = dtype  # Tag with type
+            all_documents.extend(documents)
+
+        logger.debug(f"Fetched {len(all_documents)} total documents for keyword search")
+
+        # Score and filter documents
+        scored_results = []
+        for doc in all_documents:
+            dtype = doc.get("_doc_type", "note")
            score = self._calculate_score(
                query_tokens,
-                note.get("title", ""),
-                note.get("content", ""),
+                doc.get("title", ""),
+                doc.get("content", ""),
            )

            if score > 0:  # Only include matches
                # Extract excerpt with context
                excerpt = self._extract_excerpt(
-                    note.get("content", ""),
+                    doc.get("content", ""),
                    query_tokens,
                    max_length=200,
                )

-                scored_notes.append(
+                scored_results.append(
                    SearchResult(
-                        id=note["id"],
-                        doc_type="note",
-                        title=note.get("title", "Untitled"),
+                        id=doc["id"],
+                        doc_type=dtype,
+                        title=doc.get("title", "Untitled"),
                        excerpt=excerpt,
                        score=score,
                        metadata={
-                            "category": note.get("category", ""),
-                            "modified": note.get("modified"),
+                            "category": doc.get("category", ""),
+                            "modified": doc.get("modified"),
                        },
                    )
                )

        # Sort by score (descending) and limit
-        scored_notes.sort(key=lambda x: x.score, reverse=True)
-        results = scored_notes[:limit]
+        scored_results.sort(key=lambda x: x.score, reverse=True)
+        results = scored_results[:limit]

        logger.info(f"Keyword search returned {len(results)} matching notes")
        if results:
@@ -120,6 +138,32 @@ class KeywordSearchAlgorithm(SearchAlgorithm):

        return results

+    async def _fetch_documents(
+        self, nextcloud_client: NextcloudClientProtocol, doc_type: str
+    ) -> list[dict[str, Any]]:
+        """Fetch documents of a specific type from Nextcloud.
+
+        Args:
+            nextcloud_client: Client for API access
+            doc_type: Document type to fetch ("note", "file", "calendar", etc.)
+
+        Returns:
+            List of document dictionaries with at minimum: id, title, content
+        """
+        if doc_type == "note":
+            return await nextcloud_client.notes.get_notes()
+        elif doc_type == "file":
+            # Future: fetch files when indexed
+            logger.info("File documents not yet supported for keyword search")
+            return []
+        elif doc_type == "calendar":
+            # Future: fetch calendar events when indexed
+            logger.info("Calendar documents not yet supported for keyword search")
+            return []
+        else:
+            logger.warning(f"Unknown document type '{doc_type}' for keyword search")
+            return []
+
    def _process_query(self, query: str) -> list[str]:
        """Tokenize and normalize query.

@@ -6,11 +6,14 @@ from typing import Any
 from httpx import HTTPStatusError
 from qdrant_client.models import FieldCondition, Filter, MatchValue

-from nextcloud_mcp_server.client import NextcloudClient
 from nextcloud_mcp_server.config import get_settings
 from nextcloud_mcp_server.embedding import get_embedding_service
 from nextcloud_mcp_server.observability.metrics import record_qdrant_operation
-from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
+from nextcloud_mcp_server.search.algorithms import (
+    NextcloudClientProtocol,
+    SearchAlgorithm,
+    SearchResult,
+)
 from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client

 logger = logging.getLogger(__name__)
@@ -45,7 +48,7 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
        user_id: str,
        limit: int = 10,
        doc_type: str | None = None,
-        nextcloud_client: NextcloudClient | None = None,
+        nextcloud_client: NextcloudClientProtocol | None = None,
        **kwargs: Any,
    ) -> list[SearchResult]:
        """Execute semantic search using vector similarity.
@@ -144,9 +147,13 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
        self,
        points: list[Any],
        limit: int,
-        nextcloud_client: NextcloudClient | None,
+        nextcloud_client: NextcloudClientProtocol | None,
    ) -> list[SearchResult]:
-        """Deduplicate results by doc_id and verify access.
+        """Deduplicate results by (doc_id, doc_type) and verify access.
+
+        Supports multiple document types with dispatch to appropriate client methods.
+        Deduplication is now by (doc_id, doc_type) tuple to handle cases where
+        the same ID might exist across different document types.

        Args:
            points: Qdrant search results
@@ -156,58 +163,32 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
        Returns:
            List of SearchResult objects
        """
-        seen_doc_ids = set()
+        seen_docs = set()  # Track (doc_id, doc_type) tuples
        results = []

        for result in points:
            doc_id = int(result.payload["doc_id"])
            doc_type = result.payload.get("doc_type", "note")
+            doc_key = (doc_id, doc_type)

            # Skip if we've already seen this document
-            if doc_id in seen_doc_ids:
+            if doc_key in seen_docs:
                continue

-            seen_doc_ids.add(doc_id)
+            seen_docs.add(doc_key)

            # Verify access via Nextcloud API if client provided
-            # Currently only supports notes
-            if nextcloud_client and doc_type == "note":
-                try:
-                    note = await nextcloud_client.notes.get_note(doc_id)
+            # Dispatch to appropriate client based on doc_type
+            verified_result = None

-                    results.append(
-                        SearchResult(
-                            id=doc_id,
-                            doc_type="note",
-                            title=result.payload["title"],
-                            excerpt=result.payload["excerpt"],
-                            score=result.score,
-                            metadata={
-                                "category": note.get("category", ""),
-                                "chunk_index": result.payload["chunk_index"],
-                                "total_chunks": result.payload["total_chunks"],
-                            },
-                        )
-                    )
+            if nextcloud_client:
+                verified_result = await self._verify_document_access(
+                    nextcloud_client, doc_id, doc_type, result
+                )

-                    if len(results) >= limit:
-                        break
-
-                except HTTPStatusError as e:
-                    if e.response.status_code in (403, 404):
-                        # User lost access or document deleted
-                        logger.debug(
-                            f"Skipping note {doc_id}: {e.response.status_code}"
-                        )
-                        continue
-                    else:
-                        # Log other errors but continue processing
-                        logger.warning(
-                            f"Error verifying access to note {doc_id}: "
-                            f"{e.response.status_code}"
-                        )
-                        continue
-            else:
+            if verified_result:
+                results.append(verified_result)
+            elif not nextcloud_client:
                # No access verification, return result directly
                results.append(
                    SearchResult(
@@ -223,7 +204,72 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
                    )
                )

-                if len(results) >= limit:
-                    break
+            if len(results) >= limit:
+                break

        return results
+
+    async def _verify_document_access(
+        self,
+        nextcloud_client: NextcloudClientProtocol,
+        doc_id: int,
+        doc_type: str,
+        qdrant_result: Any,
+    ) -> SearchResult | None:
+        """Verify user has access to a document via Nextcloud API.
+
+        Dispatches to appropriate client method based on document type.
+
+        Args:
+            nextcloud_client: Client for API access
+            doc_id: Document ID
+            doc_type: Document type ("note", "file", "calendar", etc.)
+            qdrant_result: Original Qdrant search result
+
+        Returns:
+            SearchResult if access verified, None if access denied or error
+        """
+        try:
+            if doc_type == "note":
+                note = await nextcloud_client.notes.get_note(doc_id)
+                return SearchResult(
+                    id=doc_id,
+                    doc_type="note",
+                    title=qdrant_result.payload["title"],
+                    excerpt=qdrant_result.payload["excerpt"],
+                    score=qdrant_result.score,
+                    metadata={
+                        "category": note.get("category", ""),
+                        "chunk_index": qdrant_result.payload["chunk_index"],
+                        "total_chunks": qdrant_result.payload["total_chunks"],
+                    },
+                )
+            elif doc_type == "file":
+                # Future: verify file access when files are indexed
+                logger.info(
+                    f"File {doc_id} found in search but file verification not yet implemented"
+                )
+                return None
+            elif doc_type == "calendar":
+                # Future: verify calendar access when calendar events are indexed
+                logger.info(
+                    f"Calendar event {doc_id} found in search but calendar verification not yet implemented"
+                )
+                return None
+            else:
+                logger.warning(
+                    f"Unknown document type '{doc_type}' for doc_id {doc_id}"
+                )
+                return None
+
+        except HTTPStatusError as e:
+            if e.response.status_code in (403, 404):
+                # User lost access or document deleted
+                logger.debug(f"Skipping {doc_type} {doc_id}: {e.response.status_code}")
+                return None
+            else:
+                # Log other errors but continue processing
+                logger.warning(
+                    f"Error verifying access to {doc_type} {doc_id}: {e.response.status_code}"
+                )
+                return None