b5b03bfd78
Implements NextcloudClientProtocol for multi-document type search, following the user requirement that document types are not 1:1 with apps (e.g., the Notes app specializes in markdown, while Files/WebDAV handles multiple file types).

Key Changes:
- NextcloudClientProtocol: Generic protocol with app-specific client properties
- get_indexed_doc_types(): Query Qdrant for actually-indexed document types
- Document dispatch: All algorithms check Qdrant before attempting access
- Cross-type deduplication: Use (doc_id, doc_type) tuples in hybrid RRF

Search Algorithm Updates:
- Semantic: Added _verify_document_access() with dispatch to appropriate client
  - Deduplication by (doc_id, doc_type) tuple
  - Only "note" verification implemented, others return None with info log
- Keyword: Added _fetch_documents() dispatch method
  - Queries Qdrant for available types before fetching
  - Supports cross-app search when doc_type=None
- Fuzzy: Same pattern as keyword search
- Hybrid: Already uses (doc_id, doc_type) for deduplication (no changes needed)

Future-Proof Design:
- File/calendar verification stubs in place
- Clear logging when unsupported types found
- Easy to extend when processor indexes new document types

Currently Supported:
- "note" documents fully implemented and tested
- Other types gracefully handled (logged but skipped)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
201 lines
6.0 KiB
Python
201 lines
6.0 KiB
Python
"""Base interfaces and data structures for search algorithms."""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from dataclasses import dataclass
|
|
from typing import Any, Protocol, runtime_checkable
|
|
|
|
|
|
@runtime_checkable
class NextcloudClientProtocol(Protocol):
    """Structural interface a Nextcloud client must satisfy for multi-document search.

    Search algorithms depend only on this protocol: a ``username`` plus one
    accessor per Nextcloud app (Notes, Files/WebDAV, Calendar, ...). Each
    accessor hands back the app-specific sub-client that performs the real
    API calls.

    Document types such as "note", "file" or "calendar" do NOT map 1:1 onto
    apps. The Notes app, for instance, specializes in markdown files, whereas
    Files/WebDAV serves many file types. The abstraction therefore lives at
    the document-type level rather than the app level.

    Before touching any document, search algorithms ask Qdrant which document
    types are actually indexed, which is what makes cross-app search degrade
    gracefully.
    """

    username: str

    # Per-app sub-clients that the search algorithms dispatch to.
    @property
    def notes(self) -> Any:
        """Sub-client of the Notes app, used to reach note documents."""

    @property
    def webdav(self) -> Any:
        """Sub-client for WebDAV, used to reach file documents."""

    @property
    def calendar(self) -> Any:
        """Sub-client of the Calendar app, used to reach event/task documents."""

    @property
    def contacts(self) -> Any:
        """Sub-client of the Contacts app, used to reach contact card documents."""

    @property
    def deck(self) -> Any:
        """Sub-client of the Deck app, used to reach deck card documents."""

    @property
    def cookbook(self) -> Any:
        """Sub-client of the Cookbook app, used to reach recipe documents."""

    @property
    def tables(self) -> Any:
        """Sub-client of the Tables app, used to reach table row documents."""
|
|
|
|
|
|
async def get_indexed_doc_types(user_id: str) -> set[str]:
    """Query Qdrant for the document types that are actually indexed for a user.

    Search algorithms call this before searching/verifying documents so that
    they only dispatch to document types that exist in the index, which lets
    cross-app search skip unavailable types gracefully.

    The user's points are scrolled page by page until Qdrant reports no
    further page, so a type is not missed merely because it first appears
    beyond the first page of results.

    Args:
        user_id: User ID to filter by

    Returns:
        Set of document type strings (e.g., {"note", "file", "calendar"}).
        An empty set is returned when the Qdrant query fails (the failure is
        logged as a warning rather than raised).

    Example:
        >>> types = await get_indexed_doc_types("alice")
        >>> if "note" in types:
        ...     # Search notes
    """
    # Imports are kept function-local, as in the original implementation.
    import logging

    from qdrant_client.models import FieldCondition, Filter, MatchValue

    from nextcloud_mcp_server.config import get_settings
    from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client

    logger = logging.getLogger(__name__)
    settings = get_settings()

    qdrant_client = await get_qdrant_client()
    collection = settings.qdrant_collection

    user_filter = Filter(
        must=[FieldCondition(key="user_id", match=MatchValue(value=user_id))]
    )

    # Scroll through the user's points and collect the distinct doc_types.
    # NOTE(review): a facet/aggregation query on "doc_type" may be cheaper
    # where the deployed Qdrant version supports it -- TODO confirm.
    doc_types: set[str] = set()
    next_offset = None
    try:
        while True:
            points, next_offset = await qdrant_client.scroll(
                collection_name=collection,
                scroll_filter=user_filter,
                limit=1000,  # Page size
                offset=next_offset,
                with_payload=["doc_type"],
                with_vectors=False,  # Don't need vectors for type discovery
            )

            for point in points:
                # payload can be None for a point without stored payload;
                # treat that the same as a missing doc_type.
                doc_type = (point.payload or {}).get("doc_type")
                if doc_type:
                    doc_types.add(doc_type)

            # A None offset means there is no further page to fetch.
            if next_offset is None:
                break

    except Exception as e:
        # Best-effort lookup: callers get "no types" instead of an exception.
        logger.warning("Failed to query Qdrant for doc_types: %s", e)
        return set()

    logger.debug(
        "Found indexed document types for user %s: %s", user_id, doc_types
    )
    return doc_types
|
|
|
|
|
|
@dataclass
|
|
class SearchResult:
|
|
"""A single search result with metadata and score.
|
|
|
|
Attributes:
|
|
id: Document ID
|
|
doc_type: Document type (note, file, calendar, contact, etc.)
|
|
title: Document title
|
|
excerpt: Content excerpt showing match context
|
|
score: Relevance score (0.0-1.0, higher is better)
|
|
metadata: Additional algorithm-specific metadata
|
|
"""
|
|
|
|
id: int
|
|
doc_type: str
|
|
title: str
|
|
excerpt: str
|
|
score: float
|
|
metadata: dict[str, Any] | None = None
|
|
|
|
def __post_init__(self):
|
|
"""Validate score is in valid range."""
|
|
if not 0.0 <= self.score <= 1.0:
|
|
raise ValueError(f"Score must be between 0.0 and 1.0, got {self.score}")
|
|
|
|
|
|
class SearchAlgorithm(ABC):
    """Common contract shared by every search algorithm.

    Concrete algorithms implement ``search()`` and ``name`` with the same
    interface, which makes them interchangeable for callers.
    """

    @abstractmethod
    async def search(
        self,
        query: str,
        user_id: str,
        limit: int = 10,
        doc_type: str | None = None,
        **kwargs: Any,
    ) -> list[SearchResult]:
        """Run a search for the given parameters.

        Args:
            query: Search query string
            user_id: User ID for multi-tenant filtering
            limit: Maximum number of results to return
            doc_type: Optional document type filter (note, file, calendar, etc.)
            **kwargs: Algorithm-specific parameters

        Returns:
            List of SearchResult objects ranked by relevance

        Raises:
            McpError: If search fails or configuration is invalid
        """

    @property
    @abstractmethod
    def name(self) -> str:
        """Algorithm name used for identification."""

    @property
    def supports_scoring(self) -> bool:
        """Tell whether the algorithm yields meaningful relevance scores.

        Returns True unless a subclass overrides it; algorithms without
        scoring support should override.
        """
        return True

    @property
    def requires_vector_db(self) -> bool:
        """Tell whether the algorithm needs a vector database.

        Returns False unless a subclass overrides it; semantic search
        should override.
        """
        return False
|