From b5b03bfd78edebf0e4a177717b11b34226edce53 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sat, 15 Nov 2025 01:19:29 +0100 Subject: [PATCH] feat: Add multi-document Protocol with cross-app search support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements NextcloudClientProtocol for multi-document type search following the user requirement that document types are not 1:1 with apps (e.g., Notes app specializes in markdown, while Files/WebDAV handles multiple file types). Key Changes: - NextcloudClientProtocol: Generic protocol with app-specific client properties - get_indexed_doc_types(): Query Qdrant for actually-indexed document types - Document dispatch: Keyword and fuzzy search check Qdrant for indexed types before fetching; semantic search dispatches on each result's doc_type - Cross-type deduplication: Use (doc_id, doc_type) tuples in semantic search (hybrid RRF already does) Search Algorithm Updates: - Semantic: Added _verify_document_access() with dispatch to appropriate client - Deduplication by (doc_id, doc_type) tuple - Only "note" verification implemented, file/calendar return None with info log, unknown types return None with warning log - Keyword: Added _fetch_documents() dispatch method - Queries Qdrant for available types before fetching - Supports cross-app search when doc_type=None - Fuzzy: Same pattern as keyword search - Hybrid: Already uses (doc_id, doc_type) for deduplication (only the client type annotation changes to NextcloudClientProtocol) Future-Proof Design: - File/calendar verification stubs in place - Clear logging when unsupported types found - Easy to extend when processor indexes new document types Currently Supported: - "note" documents fully implemented and tested - Other types gracefully handled (logged but skipped) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- nextcloud_mcp_server/search/__init__.py | 9 +- nextcloud_mcp_server/search/algorithms.py | 115 +++++++++++++++++- nextcloud_mcp_server/search/fuzzy.py | 91 ++++++++++---- nextcloud_mcp_server/search/hybrid.py | 9 +- nextcloud_mcp_server/search/keyword.py | 98 ++++++++++----- 
nextcloud_mcp_server/search/semantic.py | 138 ++++++++++++++-------- 6 files changed, 360 insertions(+), 100 deletions(-) diff --git a/nextcloud_mcp_server/search/__init__.py b/nextcloud_mcp_server/search/__init__.py index 1da5a84..d6ec32a 100644 --- a/nextcloud_mcp_server/search/__init__.py +++ b/nextcloud_mcp_server/search/__init__.py @@ -10,15 +10,22 @@ All algorithms share the same interface and can be used interchangeably by both MCP tools and the visualization pane. """ -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, + get_indexed_doc_types, +) from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm from nextcloud_mcp_server.search.hybrid import HybridSearchAlgorithm from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm __all__ = [ + "NextcloudClientProtocol", "SearchAlgorithm", "SearchResult", + "get_indexed_doc_types", "SemanticSearchAlgorithm", "KeywordSearchAlgorithm", "FuzzySearchAlgorithm", diff --git a/nextcloud_mcp_server/search/algorithms.py b/nextcloud_mcp_server/search/algorithms.py index 560e113..2a7536c 100644 --- a/nextcloud_mcp_server/search/algorithms.py +++ b/nextcloud_mcp_server/search/algorithms.py @@ -2,7 +2,120 @@ from abc import ABC, abstractmethod from dataclasses import dataclass -from typing import Any +from typing import Any, Protocol, runtime_checkable + + +@runtime_checkable +class NextcloudClientProtocol(Protocol): + """Protocol for Nextcloud client supporting multi-document search. + + This protocol defines the interface that search algorithms need from a + Nextcloud client to access documents across different apps (Notes, Files, + Calendar, etc.). The client provides access to app-specific sub-clients + that handle the actual API calls. 
+ + Document types (e.g., "note", "file", "calendar") are NOT 1:1 with apps. + For example, the Notes app specializes in markdown files, while Files/WebDAV + handles multiple file types. The abstraction is at the document type level. + + Search algorithms query Qdrant to determine which document types are actually + indexed before attempting to access them, enabling graceful cross-app search. + """ + + username: str + + # App-specific clients that search algorithms dispatch to + @property + def notes(self) -> Any: + """Notes client for accessing note documents.""" + ... + + @property + def webdav(self) -> Any: + """WebDAV client for accessing file documents.""" + ... + + @property + def calendar(self) -> Any: + """Calendar client for accessing event/task documents.""" + ... + + @property + def contacts(self) -> Any: + """Contacts client for accessing contact card documents.""" + ... + + @property + def deck(self) -> Any: + """Deck client for accessing deck card documents.""" + ... + + @property + def cookbook(self) -> Any: + """Cookbook client for accessing recipe documents.""" + ... + + @property + def tables(self) -> Any: + """Tables client for accessing table row documents.""" + ... + + +async def get_indexed_doc_types(user_id: str) -> set[str]: + """Query Qdrant to get actually-indexed document types for a user. + + This enables search algorithms to check which document types are available + before attempting to search/verify them, allowing graceful cross-app search. + + Args: + user_id: User ID to filter by + + Returns: + Set of document type strings (e.g., {"note", "file", "calendar"}) + + Example: + >>> types = await get_indexed_doc_types("alice") + >>> if "note" in types: + ... 
# Search notes + """ + import logging + + from qdrant_client.models import FieldCondition, Filter, MatchValue + + from nextcloud_mcp_server.config import get_settings + from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client + + logger = logging.getLogger(__name__) + settings = get_settings() + + qdrant_client = await get_qdrant_client() + collection = settings.qdrant_collection + + # Use scroll to sample documents and extract doc_types + # Note: This could be optimized with a facet/aggregation query if Qdrant adds support + try: + scroll_results, _next_offset = await qdrant_client.scroll( + collection_name=collection, + scroll_filter=Filter( + must=[FieldCondition(key="user_id", match=MatchValue(value=user_id))] + ), + limit=1000, # Sample size to discover types + with_payload=["doc_type"], + with_vectors=False, # Don't need vectors for type discovery + ) + + doc_types = { + point.payload.get("doc_type") + for point in scroll_results + if point.payload.get("doc_type") + } + + logger.debug(f"Found indexed document types for user {user_id}: {doc_types}") + return doc_types + + except Exception as e: + logger.warning(f"Failed to query Qdrant for doc_types: {e}") + return set() @dataclass diff --git a/nextcloud_mcp_server/search/fuzzy.py b/nextcloud_mcp_server/search/fuzzy.py index 479459f..acd57d1 100644 --- a/nextcloud_mcp_server/search/fuzzy.py +++ b/nextcloud_mcp_server/search/fuzzy.py @@ -3,8 +3,12 @@ import logging from typing import Any -from nextcloud_mcp_server.client import NextcloudClient -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, + get_indexed_doc_types, +) logger = logging.getLogger(__name__) @@ -38,7 +42,7 @@ class FuzzySearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: 
NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute fuzzy search using character overlap. @@ -67,22 +71,39 @@ class FuzzySearchAlgorithm(SearchAlgorithm): f"limit={limit}, threshold={threshold}, doc_type={doc_type}" ) - # Currently only supports notes - if doc_type and doc_type != "note": - logger.warning(f"Fuzzy search not yet implemented for doc_type={doc_type}") - return [] + # Get available document types from Qdrant + indexed_types = await get_indexed_doc_types(user_id) + logger.debug(f"Indexed document types for user: {indexed_types}") - # Fetch all notes for the user - notes = await nextcloud_client.notes.get_notes() - logger.debug(f"Fetched {len(notes)} notes for fuzzy search") + # Determine which types to search + if doc_type: + # Search specific type if requested + search_types = [doc_type] if doc_type in indexed_types else [] + if not search_types: + logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") + return [] + else: + # Search all indexed types + search_types = list(indexed_types) - # Score and filter notes - scored_notes = [] + # Fetch documents for each type and score them + all_documents = [] + for dtype in search_types: + documents = await self._fetch_documents(nextcloud_client, dtype) + for doc in documents: + doc["_doc_type"] = dtype # Tag with type + all_documents.extend(documents) + + logger.debug(f"Fetched {len(all_documents)} total documents for fuzzy search") + + # Score and filter documents + scored_results = [] query_lower = query.lower() - for note in notes: - title = note.get("title", "") - content = note.get("content", "") + for doc in all_documents: + dtype = doc.get("_doc_type", "note") + title = doc.get("title", "") + content = doc.get("content", "") # Check title match title_score = self._calculate_char_overlap(query_lower, title.lower()) @@ -100,16 +121,16 @@ class FuzzySearchAlgorithm(SearchAlgorithm): else: excerpt = self._extract_excerpt(content, max_length=200) - 
scored_notes.append( + scored_results.append( SearchResult( - id=note["id"], - doc_type="note", + id=doc["id"], + doc_type=dtype, title=title or "Untitled", excerpt=excerpt, score=best_score, metadata={ - "category": note.get("category", ""), - "modified": note.get("modified"), + "category": doc.get("category", ""), + "modified": doc.get("modified"), "match_location": "title" if title_score >= content_score else "content", @@ -118,8 +139,8 @@ class FuzzySearchAlgorithm(SearchAlgorithm): ) # Sort by score (descending) and limit - scored_notes.sort(key=lambda x: x.score, reverse=True) - results = scored_notes[:limit] + scored_results.sort(key=lambda x: x.score, reverse=True) + results = scored_results[:limit] logger.info(f"Fuzzy search returned {len(results)} matching notes") if results: @@ -131,6 +152,32 @@ class FuzzySearchAlgorithm(SearchAlgorithm): return results + async def _fetch_documents( + self, nextcloud_client: NextcloudClientProtocol, doc_type: str + ) -> list[dict[str, Any]]: + """Fetch documents of a specific type from Nextcloud. + + Args: + nextcloud_client: Client for API access + doc_type: Document type to fetch ("note", "file", "calendar", etc.) + + Returns: + List of document dictionaries with at minimum: id, title, content + """ + if doc_type == "note": + return await nextcloud_client.notes.get_notes() + elif doc_type == "file": + # Future: fetch files when indexed + logger.info("File documents not yet supported for fuzzy search") + return [] + elif doc_type == "calendar": + # Future: fetch calendar events when indexed + logger.info("Calendar documents not yet supported for fuzzy search") + return [] + else: + logger.warning(f"Unknown document type '{doc_type}' for fuzzy search") + return [] + def _calculate_char_overlap(self, query: str, text: str) -> float: """Calculate character overlap ratio between query and text. 
diff --git a/nextcloud_mcp_server/search/hybrid.py b/nextcloud_mcp_server/search/hybrid.py index a8778c8..947f1f6 100644 --- a/nextcloud_mcp_server/search/hybrid.py +++ b/nextcloud_mcp_server/search/hybrid.py @@ -5,8 +5,11 @@ import logging from collections import defaultdict from typing import Any -from nextcloud_mcp_server.client import NextcloudClient -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, +) from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm @@ -82,7 +85,7 @@ class HybridSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute hybrid search using RRF to combine algorithms. 
diff --git a/nextcloud_mcp_server/search/keyword.py b/nextcloud_mcp_server/search/keyword.py index 410a7a7..d4e8002 100644 --- a/nextcloud_mcp_server/search/keyword.py +++ b/nextcloud_mcp_server/search/keyword.py @@ -3,8 +3,12 @@ import logging from typing import Any -from nextcloud_mcp_server.client import NextcloudClient -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, + get_indexed_doc_types, +) logger = logging.getLogger(__name__) @@ -32,7 +36,7 @@ class KeywordSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute keyword search using token matching. @@ -63,52 +67,66 @@ class KeywordSearchAlgorithm(SearchAlgorithm): query_tokens = self._process_query(query) logger.debug(f"Query tokens: {query_tokens}") - # Currently only supports notes - # TODO: Extend to other document types (files, calendar, etc.) 
- if doc_type and doc_type != "note": - logger.warning( - f"Keyword search not yet implemented for doc_type={doc_type}" - ) - return [] + # Get available document types from Qdrant + indexed_types = await get_indexed_doc_types(user_id) + logger.debug(f"Indexed document types for user: {indexed_types}") - # Fetch all notes for the user - notes = await nextcloud_client.notes.get_notes() - logger.debug(f"Fetched {len(notes)} notes for keyword search") + # Determine which types to search + if doc_type: + # Search specific type if requested + search_types = [doc_type] if doc_type in indexed_types else [] + if not search_types: + logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") + return [] + else: + # Search all indexed types + search_types = list(indexed_types) - # Score and filter notes - scored_notes = [] - for note in notes: + # Fetch documents for each type and score them + all_documents = [] + for dtype in search_types: + documents = await self._fetch_documents(nextcloud_client, dtype) + for doc in documents: + doc["_doc_type"] = dtype # Tag with type + all_documents.extend(documents) + + logger.debug(f"Fetched {len(all_documents)} total documents for keyword search") + + # Score and filter documents + scored_results = [] + for doc in all_documents: + dtype = doc.get("_doc_type", "note") score = self._calculate_score( query_tokens, - note.get("title", ""), - note.get("content", ""), + doc.get("title", ""), + doc.get("content", ""), ) if score > 0: # Only include matches # Extract excerpt with context excerpt = self._extract_excerpt( - note.get("content", ""), + doc.get("content", ""), query_tokens, max_length=200, ) - scored_notes.append( + scored_results.append( SearchResult( - id=note["id"], - doc_type="note", - title=note.get("title", "Untitled"), + id=doc["id"], + doc_type=dtype, + title=doc.get("title", "Untitled"), excerpt=excerpt, score=score, metadata={ - "category": note.get("category", ""), - "modified": note.get("modified"), + 
"category": doc.get("category", ""), + "modified": doc.get("modified"), }, ) ) # Sort by score (descending) and limit - scored_notes.sort(key=lambda x: x.score, reverse=True) - results = scored_notes[:limit] + scored_results.sort(key=lambda x: x.score, reverse=True) + results = scored_results[:limit] logger.info(f"Keyword search returned {len(results)} matching notes") if results: @@ -120,6 +138,32 @@ class KeywordSearchAlgorithm(SearchAlgorithm): return results + async def _fetch_documents( + self, nextcloud_client: NextcloudClientProtocol, doc_type: str + ) -> list[dict[str, Any]]: + """Fetch documents of a specific type from Nextcloud. + + Args: + nextcloud_client: Client for API access + doc_type: Document type to fetch ("note", "file", "calendar", etc.) + + Returns: + List of document dictionaries with at minimum: id, title, content + """ + if doc_type == "note": + return await nextcloud_client.notes.get_notes() + elif doc_type == "file": + # Future: fetch files when indexed + logger.info("File documents not yet supported for keyword search") + return [] + elif doc_type == "calendar": + # Future: fetch calendar events when indexed + logger.info("Calendar documents not yet supported for keyword search") + return [] + else: + logger.warning(f"Unknown document type '{doc_type}' for keyword search") + return [] + def _process_query(self, query: str) -> list[str]: """Tokenize and normalize query. 
diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py index c6e632d..e38b16d 100644 --- a/nextcloud_mcp_server/search/semantic.py +++ b/nextcloud_mcp_server/search/semantic.py @@ -6,11 +6,14 @@ from typing import Any from httpx import HTTPStatusError from qdrant_client.models import FieldCondition, Filter, MatchValue -from nextcloud_mcp_server.client import NextcloudClient from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.embedding import get_embedding_service from nextcloud_mcp_server.observability.metrics import record_qdrant_operation -from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult +from nextcloud_mcp_server.search.algorithms import ( + NextcloudClientProtocol, + SearchAlgorithm, + SearchResult, +) from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -45,7 +48,7 @@ class SemanticSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClient | None = None, + nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute semantic search using vector similarity. @@ -144,9 +147,13 @@ class SemanticSearchAlgorithm(SearchAlgorithm): self, points: list[Any], limit: int, - nextcloud_client: NextcloudClient | None, + nextcloud_client: NextcloudClientProtocol | None, ) -> list[SearchResult]: - """Deduplicate results by doc_id and verify access. + """Deduplicate results by (doc_id, doc_type) and verify access. + + Supports multiple document types with dispatch to appropriate client methods. + Deduplication is now by (doc_id, doc_type) tuple to handle cases where + the same ID might exist across different document types. 
Args: points: Qdrant search results @@ -156,58 +163,32 @@ class SemanticSearchAlgorithm(SearchAlgorithm): Returns: List of SearchResult objects """ - seen_doc_ids = set() + seen_docs = set() # Track (doc_id, doc_type) tuples results = [] for result in points: doc_id = int(result.payload["doc_id"]) doc_type = result.payload.get("doc_type", "note") + doc_key = (doc_id, doc_type) # Skip if we've already seen this document - if doc_id in seen_doc_ids: + if doc_key in seen_docs: continue - seen_doc_ids.add(doc_id) + seen_docs.add(doc_key) # Verify access via Nextcloud API if client provided - # Currently only supports notes - if nextcloud_client and doc_type == "note": - try: - note = await nextcloud_client.notes.get_note(doc_id) + # Dispatch to appropriate client based on doc_type + verified_result = None - results.append( - SearchResult( - id=doc_id, - doc_type="note", - title=result.payload["title"], - excerpt=result.payload["excerpt"], - score=result.score, - metadata={ - "category": note.get("category", ""), - "chunk_index": result.payload["chunk_index"], - "total_chunks": result.payload["total_chunks"], - }, - ) - ) + if nextcloud_client: + verified_result = await self._verify_document_access( + nextcloud_client, doc_id, doc_type, result + ) - if len(results) >= limit: - break - - except HTTPStatusError as e: - if e.response.status_code in (403, 404): - # User lost access or document deleted - logger.debug( - f"Skipping note {doc_id}: {e.response.status_code}" - ) - continue - else: - # Log other errors but continue processing - logger.warning( - f"Error verifying access to note {doc_id}: " - f"{e.response.status_code}" - ) - continue - else: + if verified_result: + results.append(verified_result) + elif not nextcloud_client: # No access verification, return result directly results.append( SearchResult( @@ -223,7 +204,72 @@ class SemanticSearchAlgorithm(SearchAlgorithm): ) ) - if len(results) >= limit: - break + if len(results) >= limit: + break return results + + 
async def _verify_document_access( + self, + nextcloud_client: NextcloudClientProtocol, + doc_id: int, + doc_type: str, + qdrant_result: Any, + ) -> SearchResult | None: + """Verify user has access to a document via Nextcloud API. + + Dispatches to appropriate client method based on document type. + + Args: + nextcloud_client: Client for API access + doc_id: Document ID + doc_type: Document type ("note", "file", "calendar", etc.) + qdrant_result: Original Qdrant search result + + Returns: + SearchResult if access verified, None if access denied or error + """ + try: + if doc_type == "note": + note = await nextcloud_client.notes.get_note(doc_id) + return SearchResult( + id=doc_id, + doc_type="note", + title=qdrant_result.payload["title"], + excerpt=qdrant_result.payload["excerpt"], + score=qdrant_result.score, + metadata={ + "category": note.get("category", ""), + "chunk_index": qdrant_result.payload["chunk_index"], + "total_chunks": qdrant_result.payload["total_chunks"], + }, + ) + elif doc_type == "file": + # Future: verify file access when files are indexed + logger.info( + f"File {doc_id} found in search but file verification not yet implemented" + ) + return None + elif doc_type == "calendar": + # Future: verify calendar access when calendar events are indexed + logger.info( + f"Calendar event {doc_id} found in search but calendar verification not yet implemented" + ) + return None + else: + logger.warning( + f"Unknown document type '{doc_type}' for doc_id {doc_id}" + ) + return None + + except HTTPStatusError as e: + if e.response.status_code in (403, 404): + # User lost access or document deleted + logger.debug(f"Skipping {doc_type} {doc_id}: {e.response.status_code}") + return None + else: + # Log other errors but continue processing + logger.warning( + f"Error verifying access to {doc_type} {doc_id}: {e.response.status_code}" + ) + return None