Files
nextcloud-mcp-server/nextcloud_mcp_server/search/algorithms.py
T
Chris Coutinho b5b03bfd78 feat: Add multi-document Protocol with cross-app search support
Implements NextcloudClientProtocol for multi-document-type search, following the
user requirement that document types are not 1:1 with apps (e.g., the Notes app
specializes in markdown, while Files/WebDAV handles multiple file types).

Key Changes:
- NextcloudClientProtocol: Generic protocol with app-specific client properties
- get_indexed_doc_types(): Query Qdrant for actually-indexed document types
- Document dispatch: All algorithms check Qdrant before attempting access
- Cross-type deduplication: Use (doc_id, doc_type) tuples in hybrid RRF

Search Algorithm Updates:
- Semantic: Added _verify_document_access() with dispatch to appropriate client
  - Deduplication by (doc_id, doc_type) tuple
  - Only "note" verification implemented, others return None with info log
- Keyword: Added _fetch_documents() dispatch method
  - Queries Qdrant for available types before fetching
  - Supports cross-app search when doc_type=None
- Fuzzy: Same pattern as keyword search
- Hybrid: Already uses (doc_id, doc_type) for deduplication (no changes needed)

Future-Proof Design:
- File/calendar verification stubs in place
- Clear logging when unsupported types found
- Easy to extend when processor indexes new document types

Currently Supported:
- "note" documents fully implemented and tested
- Other types gracefully handled (logged but skipped)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-15 01:19:29 +01:00

201 lines
6.0 KiB
Python

"""Base interfaces and data structures for search algorithms."""
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Protocol, runtime_checkable
@runtime_checkable
class NextcloudClientProtocol(Protocol):
    """Structural interface of a Nextcloud client usable for multi-document search.

    Search algorithms rely on this protocol to reach documents that live in
    different Nextcloud apps (Notes, Files, Calendar, etc.). The client does
    not issue the API calls itself; it exposes app-specific sub-clients that
    handle them, and the algorithms dispatch to whichever one they need.

    Document types (e.g., "note", "file", "calendar") are NOT 1:1 with apps:
    the Notes app specializes in markdown files, whereas Files/WebDAV handles
    multiple file types. The abstraction therefore sits at the document type
    level.

    Before accessing a document type, search algorithms query Qdrant to learn
    which types are actually indexed, which keeps cross-app search graceful.
    """

    username: str

    # App-specific sub-clients that search algorithms dispatch to, listed
    # alphabetically. Each one is typed ``Any`` by this protocol.

    @property
    def calendar(self) -> Any:
        """Sub-client used to access event/task documents (Calendar)."""
        ...

    @property
    def contacts(self) -> Any:
        """Sub-client used to access contact card documents (Contacts)."""
        ...

    @property
    def cookbook(self) -> Any:
        """Sub-client used to access recipe documents (Cookbook)."""
        ...

    @property
    def deck(self) -> Any:
        """Sub-client used to access deck card documents (Deck)."""
        ...

    @property
    def notes(self) -> Any:
        """Sub-client used to access note documents (Notes)."""
        ...

    @property
    def tables(self) -> Any:
        """Sub-client used to access table row documents (Tables)."""
        ...

    @property
    def webdav(self) -> Any:
        """Sub-client used to access file documents (WebDAV)."""
        ...
async def get_indexed_doc_types(user_id: str, *, sample_limit: int = 1000) -> set[str]:
    """Query Qdrant for the document types actually indexed for a user.

    Search algorithms call this to find out which document types are available
    before attempting to search/verify them, allowing graceful cross-app search.

    Discovery is sample-based: a single scroll request fetches at most
    ``sample_limit`` of the user's points and collects their ``doc_type``
    payload values. Types that occur only on points outside that sample are
    not reported.

    Args:
        user_id: User ID to filter by.
        sample_limit: Maximum number of points fetched by the scroll request
            used for type discovery. Defaults to 1000.

    Returns:
        Set of document type strings (e.g., {"note", "file", "calendar"}).
        An empty set is returned when no typed points are found or when the
        scroll request fails (the failure is logged as a warning). Errors
        raised while loading settings or obtaining the Qdrant client happen
        before the guarded section and therefore propagate to the caller.

    Example:
        >>> types = await get_indexed_doc_types("alice")
        >>> if "note" in types:
        ...     # Search notes
    """
    # Imports are kept function-local, as in the original implementation.
    import logging

    from qdrant_client.models import FieldCondition, Filter, MatchValue

    from nextcloud_mcp_server.config import get_settings
    from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client

    logger = logging.getLogger(__name__)
    settings = get_settings()
    qdrant_client = await get_qdrant_client()
    collection = settings.qdrant_collection

    # Use scroll to sample documents and extract doc_types.
    # Note: this could be optimized with a facet/aggregation query where the
    # Qdrant deployment supports it.
    try:
        scroll_results, _next_offset = await qdrant_client.scroll(
            collection_name=collection,
            scroll_filter=Filter(
                must=[FieldCondition(key="user_id", match=MatchValue(value=user_id))]
            ),
            limit=sample_limit,  # Sample size to discover types
            with_payload=["doc_type"],
            with_vectors=False,  # Don't need vectors for type discovery
        )
        # A point whose payload is None must not abort discovery for every
        # other point, so fall back to an empty mapping. Falsy doc_type
        # values (missing, None, "") are skipped.
        doc_types = {
            doc_type
            for point in scroll_results
            if (doc_type := (point.payload or {}).get("doc_type"))
        }
        logger.debug(
            "Found indexed document types for user %s: %s", user_id, doc_types
        )
        return doc_types
    except Exception as e:
        # Deliberate best-effort: callers treat "no types" as "nothing to search".
        logger.warning("Failed to query Qdrant for doc_types: %s", e)
        return set()
@dataclass
class SearchResult:
"""A single search result with metadata and score.
Attributes:
id: Document ID
doc_type: Document type (note, file, calendar, contact, etc.)
title: Document title
excerpt: Content excerpt showing match context
score: Relevance score (0.0-1.0, higher is better)
metadata: Additional algorithm-specific metadata
"""
id: int
doc_type: str
title: str
excerpt: str
score: float
metadata: dict[str, Any] | None = None
def __post_init__(self):
"""Validate score is in valid range."""
if not 0.0 <= self.score <= 1.0:
raise ValueError(f"Score must be between 0.0 and 1.0, got {self.score}")
class SearchAlgorithm(ABC):
"""Abstract base class for search algorithms.
All search algorithms must implement the search() method with consistent
interface, allowing them to be used interchangeably.
"""
@abstractmethod
async def search(
self,
query: str,
user_id: str,
limit: int = 10,
doc_type: str | None = None,
**kwargs: Any,
) -> list[SearchResult]:
"""Execute search with the given parameters.
Args:
query: Search query string
user_id: User ID for multi-tenant filtering
limit: Maximum number of results to return
doc_type: Optional document type filter (note, file, calendar, etc.)
**kwargs: Algorithm-specific parameters
Returns:
List of SearchResult objects ranked by relevance
Raises:
McpError: If search fails or configuration is invalid
"""
pass
@property
@abstractmethod
def name(self) -> str:
"""Return algorithm name for identification."""
pass
@property
def supports_scoring(self) -> bool:
"""Whether this algorithm provides meaningful relevance scores.
Default: True. Override if algorithm doesn't support scoring.
"""
return True
@property
def requires_vector_db(self) -> bool:
"""Whether this algorithm requires vector database.
Default: False. Override for semantic search.
"""
return False