feat: Add multi-document Protocol with cross-app search support

Implements NextcloudClientProtocol for multi-document-type search, following
the user requirement that document types are not 1:1 with apps (e.g., the Notes
app specializes in markdown, while Files/WebDAV handles multiple file types).

Key Changes:
- NextcloudClientProtocol: Generic protocol with app-specific client properties
- get_indexed_doc_types(): Query Qdrant for actually-indexed document types
- Document dispatch: All algorithms check Qdrant before attempting access
- Cross-type deduplication: Semantic search now deduplicates by (doc_id, doc_type) tuples, matching the keys hybrid RRF already uses

Search Algorithm Updates:
- Semantic: Added _verify_document_access() with dispatch to appropriate client
  - Deduplication by (doc_id, doc_type) tuple
  - Only "note" verification implemented; "file"/"calendar" return None with an info log, unknown types return None with a warning
- Keyword: Added _fetch_documents() dispatch method
  - Queries Qdrant for available types before fetching
  - Supports cross-app search when doc_type=None
- Fuzzy: Same pattern as keyword search
- Hybrid: Already uses (doc_id, doc_type) for deduplication (no changes needed)

Future-Proof Design:
- File/calendar verification stubs in place
- Clear logging when unsupported types found
- Easy to extend when processor indexes new document types

Currently Supported:
- "note" documents fully implemented and tested
- Other types gracefully handled (logged but skipped)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Chris Coutinho
2025-11-15 01:19:29 +01:00
parent f3bdb8b885
commit b5b03bfd78
6 changed files with 360 additions and 100 deletions
+8 -1
View File
@@ -10,15 +10,22 @@ All algorithms share the same interface and can be used interchangeably by both
MCP tools and the visualization pane.
"""
from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
from nextcloud_mcp_server.search.algorithms import (
NextcloudClientProtocol,
SearchAlgorithm,
SearchResult,
get_indexed_doc_types,
)
from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm
from nextcloud_mcp_server.search.hybrid import HybridSearchAlgorithm
from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm
from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm
__all__ = [
"NextcloudClientProtocol",
"SearchAlgorithm",
"SearchResult",
"get_indexed_doc_types",
"SemanticSearchAlgorithm",
"KeywordSearchAlgorithm",
"FuzzySearchAlgorithm",
+114 -1
View File
@@ -2,7 +2,120 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any
from typing import Any, Protocol, runtime_checkable
@runtime_checkable
class NextcloudClientProtocol(Protocol):
    """Structural interface of a Nextcloud client usable for multi-document search.

    Search algorithms depend on this protocol rather than on a concrete client
    class. It exposes one sub-client per Nextcloud app (Notes, Files/WebDAV,
    Calendar, ...); the sub-clients perform the actual API calls.

    Document types (e.g., "note", "file", "calendar") are NOT 1:1 with apps:
    the Notes app specializes in markdown files, while Files/WebDAV handles
    multiple file types. The abstraction therefore lives at the document type
    level, and algorithms dispatch from a document type to a sub-client.

    Algorithms consult Qdrant for the document types that are actually indexed
    before touching a sub-client, which keeps cross-app search graceful.

    Every sub-client is typed as ``Any``; this protocol only requires that the
    attribute exists, not any particular sub-client API.
    """

    username: str

    # --- App-specific sub-clients that search algorithms dispatch to ---

    @property
    def notes(self) -> Any:
        """Sub-client for note documents (Notes app)."""

    @property
    def webdav(self) -> Any:
        """Sub-client for file documents (WebDAV)."""

    @property
    def calendar(self) -> Any:
        """Sub-client for event/task documents (Calendar app)."""

    @property
    def contacts(self) -> Any:
        """Sub-client for contact card documents (Contacts app)."""

    @property
    def deck(self) -> Any:
        """Sub-client for deck card documents (Deck app)."""

    @property
    def cookbook(self) -> Any:
        """Sub-client for recipe documents (Cookbook app)."""

    @property
    def tables(self) -> Any:
        """Sub-client for table row documents (Tables app)."""
async def get_indexed_doc_types(user_id: str) -> set[str]:
    """Query Qdrant for the document types actually indexed for a user.

    Search algorithms call this before searching/verifying a document type so
    that types without indexed content are skipped gracefully.

    All of the user's points are visited by following the scroll cursor page
    by page. Reading only the first page would miss any type whose points all
    sit beyond that page (e.g. a user with more than one page of note chunks
    and a handful of files).

    Args:
        user_id: User ID to filter by (matched against the ``user_id`` payload
            field).

    Returns:
        Set of document type strings (e.g., {"note", "file", "calendar"}).
        If a Qdrant query fails, a warning is logged and the types collected
        so far are returned (an empty set when the first page already failed).

    Example:
        >>> types = await get_indexed_doc_types("alice")
        >>> if "note" in types:
        ...     # Search notes
    """
    # Imported locally, as in the original implementation, so this module does
    # not need these dependencies at import time.
    import logging

    from qdrant_client.models import FieldCondition, Filter, MatchValue

    from nextcloud_mcp_server.config import get_settings
    from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client

    logger = logging.getLogger(__name__)

    settings = get_settings()
    qdrant_client = await get_qdrant_client()
    collection = settings.qdrant_collection

    user_filter = Filter(
        must=[FieldCondition(key="user_id", match=MatchValue(value=user_id))]
    )
    page_size = 1000  # Points requested per scroll call

    doc_types: set[str] = set()
    offset = None  # None asks Qdrant to start from the first point

    # NOTE(review): Qdrant's facet API may be able to replace this full scan
    # with a single aggregation request - confirm server/client support first.
    try:
        while True:
            points, offset = await qdrant_client.scroll(
                collection_name=collection,
                scroll_filter=user_filter,
                limit=page_size,
                offset=offset,
                with_payload=["doc_type"],
                with_vectors=False,  # Don't need vectors for type discovery
            )
            for point in points:
                # payload can be None; without the guard the AttributeError
                # would be swallowed below and discard every discovered type.
                doc_type = (point.payload or {}).get("doc_type")
                if doc_type:
                    doc_types.add(doc_type)
            if offset is None:  # No further page
                break
    except Exception as e:
        # Deliberate best-effort: callers treat missing types as "not indexed".
        logger.warning("Failed to query Qdrant for doc_types: %s", e)
        return doc_types

    logger.debug(
        "Found indexed document types for user %s: %s", user_id, doc_types
    )
    return doc_types
@dataclass
+69 -22
View File
@@ -3,8 +3,12 @@
import logging
from typing import Any
from nextcloud_mcp_server.client import NextcloudClient
from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
from nextcloud_mcp_server.search.algorithms import (
NextcloudClientProtocol,
SearchAlgorithm,
SearchResult,
get_indexed_doc_types,
)
logger = logging.getLogger(__name__)
@@ -38,7 +42,7 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
user_id: str,
limit: int = 10,
doc_type: str | None = None,
nextcloud_client: NextcloudClient | None = None,
nextcloud_client: NextcloudClientProtocol | None = None,
**kwargs: Any,
) -> list[SearchResult]:
"""Execute fuzzy search using character overlap.
@@ -67,22 +71,39 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
f"limit={limit}, threshold={threshold}, doc_type={doc_type}"
)
# Currently only supports notes
if doc_type and doc_type != "note":
logger.warning(f"Fuzzy search not yet implemented for doc_type={doc_type}")
return []
# Get available document types from Qdrant
indexed_types = await get_indexed_doc_types(user_id)
logger.debug(f"Indexed document types for user: {indexed_types}")
# Fetch all notes for the user
notes = await nextcloud_client.notes.get_notes()
logger.debug(f"Fetched {len(notes)} notes for fuzzy search")
# Determine which types to search
if doc_type:
# Search specific type if requested
search_types = [doc_type] if doc_type in indexed_types else []
if not search_types:
logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}")
return []
else:
# Search all indexed types
search_types = list(indexed_types)
# Score and filter notes
scored_notes = []
# Fetch documents for each type and score them
all_documents = []
for dtype in search_types:
documents = await self._fetch_documents(nextcloud_client, dtype)
for doc in documents:
doc["_doc_type"] = dtype # Tag with type
all_documents.extend(documents)
logger.debug(f"Fetched {len(all_documents)} total documents for fuzzy search")
# Score and filter documents
scored_results = []
query_lower = query.lower()
for note in notes:
title = note.get("title", "")
content = note.get("content", "")
for doc in all_documents:
dtype = doc.get("_doc_type", "note")
title = doc.get("title", "")
content = doc.get("content", "")
# Check title match
title_score = self._calculate_char_overlap(query_lower, title.lower())
@@ -100,16 +121,16 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
else:
excerpt = self._extract_excerpt(content, max_length=200)
scored_notes.append(
scored_results.append(
SearchResult(
id=note["id"],
doc_type="note",
id=doc["id"],
doc_type=dtype,
title=title or "Untitled",
excerpt=excerpt,
score=best_score,
metadata={
"category": note.get("category", ""),
"modified": note.get("modified"),
"category": doc.get("category", ""),
"modified": doc.get("modified"),
"match_location": "title"
if title_score >= content_score
else "content",
@@ -118,8 +139,8 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
)
# Sort by score (descending) and limit
scored_notes.sort(key=lambda x: x.score, reverse=True)
results = scored_notes[:limit]
scored_results.sort(key=lambda x: x.score, reverse=True)
results = scored_results[:limit]
logger.info(f"Fuzzy search returned {len(results)} matching notes")
if results:
@@ -131,6 +152,32 @@ class FuzzySearchAlgorithm(SearchAlgorithm):
return results
async def _fetch_documents(
    self, nextcloud_client: NextcloudClientProtocol, doc_type: str
) -> list[dict[str, Any]]:
    """Load every document of one type so fuzzy search can score it.

    Args:
        nextcloud_client: Client for API access
        doc_type: Document type to fetch ("note", "file", "calendar", etc.)

    Returns:
        List of document dictionaries with at minimum: id, title, content.
        Only "note" is fetched (via the notes sub-client); every other type
        is logged and yields an empty list.
    """
    # The only type with a real fetch path today.
    if doc_type == "note":
        return await nextcloud_client.notes.get_notes()

    # Known types that are reserved for future indexing support.
    not_yet_supported = {
        "file": "File documents not yet supported for fuzzy search",
        "calendar": "Calendar documents not yet supported for fuzzy search",
    }
    message = not_yet_supported.get(doc_type)
    if message is None:
        logger.warning(f"Unknown document type '{doc_type}' for fuzzy search")
    else:
        logger.info(message)
    return []
def _calculate_char_overlap(self, query: str, text: str) -> float:
"""Calculate character overlap ratio between query and text.
+6 -3
View File
@@ -5,8 +5,11 @@ import logging
from collections import defaultdict
from typing import Any
from nextcloud_mcp_server.client import NextcloudClient
from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
from nextcloud_mcp_server.search.algorithms import (
NextcloudClientProtocol,
SearchAlgorithm,
SearchResult,
)
from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm
from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm
from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm
@@ -82,7 +85,7 @@ class HybridSearchAlgorithm(SearchAlgorithm):
user_id: str,
limit: int = 10,
doc_type: str | None = None,
nextcloud_client: NextcloudClient | None = None,
nextcloud_client: NextcloudClientProtocol | None = None,
**kwargs: Any,
) -> list[SearchResult]:
"""Execute hybrid search using RRF to combine algorithms.
+71 -27
View File
@@ -3,8 +3,12 @@
import logging
from typing import Any
from nextcloud_mcp_server.client import NextcloudClient
from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
from nextcloud_mcp_server.search.algorithms import (
NextcloudClientProtocol,
SearchAlgorithm,
SearchResult,
get_indexed_doc_types,
)
logger = logging.getLogger(__name__)
@@ -32,7 +36,7 @@ class KeywordSearchAlgorithm(SearchAlgorithm):
user_id: str,
limit: int = 10,
doc_type: str | None = None,
nextcloud_client: NextcloudClient | None = None,
nextcloud_client: NextcloudClientProtocol | None = None,
**kwargs: Any,
) -> list[SearchResult]:
"""Execute keyword search using token matching.
@@ -63,52 +67,66 @@ class KeywordSearchAlgorithm(SearchAlgorithm):
query_tokens = self._process_query(query)
logger.debug(f"Query tokens: {query_tokens}")
# Currently only supports notes
# TODO: Extend to other document types (files, calendar, etc.)
if doc_type and doc_type != "note":
logger.warning(
f"Keyword search not yet implemented for doc_type={doc_type}"
)
return []
# Get available document types from Qdrant
indexed_types = await get_indexed_doc_types(user_id)
logger.debug(f"Indexed document types for user: {indexed_types}")
# Fetch all notes for the user
notes = await nextcloud_client.notes.get_notes()
logger.debug(f"Fetched {len(notes)} notes for keyword search")
# Determine which types to search
if doc_type:
# Search specific type if requested
search_types = [doc_type] if doc_type in indexed_types else []
if not search_types:
logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}")
return []
else:
# Search all indexed types
search_types = list(indexed_types)
# Score and filter notes
scored_notes = []
for note in notes:
# Fetch documents for each type and score them
all_documents = []
for dtype in search_types:
documents = await self._fetch_documents(nextcloud_client, dtype)
for doc in documents:
doc["_doc_type"] = dtype # Tag with type
all_documents.extend(documents)
logger.debug(f"Fetched {len(all_documents)} total documents for keyword search")
# Score and filter documents
scored_results = []
for doc in all_documents:
dtype = doc.get("_doc_type", "note")
score = self._calculate_score(
query_tokens,
note.get("title", ""),
note.get("content", ""),
doc.get("title", ""),
doc.get("content", ""),
)
if score > 0: # Only include matches
# Extract excerpt with context
excerpt = self._extract_excerpt(
note.get("content", ""),
doc.get("content", ""),
query_tokens,
max_length=200,
)
scored_notes.append(
scored_results.append(
SearchResult(
id=note["id"],
doc_type="note",
title=note.get("title", "Untitled"),
id=doc["id"],
doc_type=dtype,
title=doc.get("title", "Untitled"),
excerpt=excerpt,
score=score,
metadata={
"category": note.get("category", ""),
"modified": note.get("modified"),
"category": doc.get("category", ""),
"modified": doc.get("modified"),
},
)
)
# Sort by score (descending) and limit
scored_notes.sort(key=lambda x: x.score, reverse=True)
results = scored_notes[:limit]
scored_results.sort(key=lambda x: x.score, reverse=True)
results = scored_results[:limit]
logger.info(f"Keyword search returned {len(results)} matching notes")
if results:
@@ -120,6 +138,32 @@ class KeywordSearchAlgorithm(SearchAlgorithm):
return results
async def _fetch_documents(
    self, nextcloud_client: NextcloudClientProtocol, doc_type: str
) -> list[dict[str, Any]]:
    """Load every document of one type so keyword search can score it.

    Args:
        nextcloud_client: Client for API access
        doc_type: Document type to fetch ("note", "file", "calendar", etc.)

    Returns:
        List of document dictionaries with at minimum: id, title, content.
        Only "note" is fetched (via the notes sub-client); every other type
        is logged and yields an empty list.
    """
    # The only type with a real fetch path today.
    if doc_type == "note":
        return await nextcloud_client.notes.get_notes()

    # Known types that are reserved for future indexing support.
    not_yet_supported = {
        "file": "File documents not yet supported for keyword search",
        "calendar": "Calendar documents not yet supported for keyword search",
    }
    message = not_yet_supported.get(doc_type)
    if message is None:
        logger.warning(f"Unknown document type '{doc_type}' for keyword search")
    else:
        logger.info(message)
    return []
def _process_query(self, query: str) -> list[str]:
"""Tokenize and normalize query.
+92 -46
View File
@@ -6,11 +6,14 @@ from typing import Any
from httpx import HTTPStatusError
from qdrant_client.models import FieldCondition, Filter, MatchValue
from nextcloud_mcp_server.client import NextcloudClient
from nextcloud_mcp_server.config import get_settings
from nextcloud_mcp_server.embedding import get_embedding_service
from nextcloud_mcp_server.observability.metrics import record_qdrant_operation
from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult
from nextcloud_mcp_server.search.algorithms import (
NextcloudClientProtocol,
SearchAlgorithm,
SearchResult,
)
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
logger = logging.getLogger(__name__)
@@ -45,7 +48,7 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
user_id: str,
limit: int = 10,
doc_type: str | None = None,
nextcloud_client: NextcloudClient | None = None,
nextcloud_client: NextcloudClientProtocol | None = None,
**kwargs: Any,
) -> list[SearchResult]:
"""Execute semantic search using vector similarity.
@@ -144,9 +147,13 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
self,
points: list[Any],
limit: int,
nextcloud_client: NextcloudClient | None,
nextcloud_client: NextcloudClientProtocol | None,
) -> list[SearchResult]:
"""Deduplicate results by doc_id and verify access.
"""Deduplicate results by (doc_id, doc_type) and verify access.
Supports multiple document types with dispatch to appropriate client methods.
Deduplication is now by (doc_id, doc_type) tuple to handle cases where
the same ID might exist across different document types.
Args:
points: Qdrant search results
@@ -156,58 +163,32 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
Returns:
List of SearchResult objects
"""
seen_doc_ids = set()
seen_docs = set() # Track (doc_id, doc_type) tuples
results = []
for result in points:
doc_id = int(result.payload["doc_id"])
doc_type = result.payload.get("doc_type", "note")
doc_key = (doc_id, doc_type)
# Skip if we've already seen this document
if doc_id in seen_doc_ids:
if doc_key in seen_docs:
continue
seen_doc_ids.add(doc_id)
seen_docs.add(doc_key)
# Verify access via Nextcloud API if client provided
# Currently only supports notes
if nextcloud_client and doc_type == "note":
try:
note = await nextcloud_client.notes.get_note(doc_id)
# Dispatch to appropriate client based on doc_type
verified_result = None
results.append(
SearchResult(
id=doc_id,
doc_type="note",
title=result.payload["title"],
excerpt=result.payload["excerpt"],
score=result.score,
metadata={
"category": note.get("category", ""),
"chunk_index": result.payload["chunk_index"],
"total_chunks": result.payload["total_chunks"],
},
)
)
if nextcloud_client:
verified_result = await self._verify_document_access(
nextcloud_client, doc_id, doc_type, result
)
if len(results) >= limit:
break
except HTTPStatusError as e:
if e.response.status_code in (403, 404):
# User lost access or document deleted
logger.debug(
f"Skipping note {doc_id}: {e.response.status_code}"
)
continue
else:
# Log other errors but continue processing
logger.warning(
f"Error verifying access to note {doc_id}: "
f"{e.response.status_code}"
)
continue
else:
if verified_result:
results.append(verified_result)
elif not nextcloud_client:
# No access verification, return result directly
results.append(
SearchResult(
@@ -223,7 +204,72 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
)
)
if len(results) >= limit:
break
if len(results) >= limit:
break
return results
async def _verify_document_access(
    self,
    nextcloud_client: NextcloudClientProtocol,
    doc_id: int,
    doc_type: str,
    qdrant_result: Any,
) -> SearchResult | None:
    """Check via the Nextcloud API that the user can still access a document.

    Dispatches on the document type. Only "note" is verified, by fetching the
    note through the notes sub-client; all other types are logged and rejected.

    Args:
        nextcloud_client: Client for API access
        doc_id: Document ID
        doc_type: Document type ("note", "file", "calendar", etc.)
        qdrant_result: Original Qdrant search result; its payload supplies
            title, excerpt and chunk fields, and its score is passed through

    Returns:
        SearchResult if access verified, None if the type is unsupported,
        access is denied, or the API call fails with an HTTP status error
    """
    # Types without a verification path are rejected up front.
    if doc_type != "note":
        if doc_type == "file":
            # Future: verify file access when files are indexed
            logger.info(
                f"File {doc_id} found in search but file verification not yet implemented"
            )
        elif doc_type == "calendar":
            # Future: verify calendar access when calendar events are indexed
            logger.info(
                f"Calendar event {doc_id} found in search but calendar verification not yet implemented"
            )
        else:
            logger.warning(
                f"Unknown document type '{doc_type}' for doc_id {doc_id}"
            )
        return None

    try:
        note = await nextcloud_client.notes.get_note(doc_id)
    except HTTPStatusError as e:
        status = e.response.status_code
        if status in (403, 404):
            # User lost access or document deleted
            logger.debug(f"Skipping {doc_type} {doc_id}: {status}")
        else:
            # Other HTTP errors are logged more loudly, but the hit is dropped too
            logger.warning(
                f"Error verifying access to {doc_type} {doc_id}: {status}"
            )
        return None

    payload = qdrant_result.payload
    return SearchResult(
        id=doc_id,
        doc_type="note",
        title=payload["title"],
        excerpt=payload["excerpt"],
        score=qdrant_result.score,
        metadata={
            # category comes from the live note; chunk fields from the index
            "category": note.get("category", ""),
            "chunk_index": payload["chunk_index"],
            "total_chunks": payload["total_chunks"],
        },
    )