diff --git a/nextcloud_mcp_server/search/fuzzy.py b/nextcloud_mcp_server/search/fuzzy.py index acd57d1..8dcb5db 100644 --- a/nextcloud_mcp_server/search/fuzzy.py +++ b/nextcloud_mcp_server/search/fuzzy.py @@ -1,14 +1,18 @@ -"""Fuzzy search algorithm using character overlap matching.""" +"""Fuzzy search algorithm using character overlap matching on Qdrant payload.""" import logging from typing import Any +from httpx import HTTPStatusError +from qdrant_client.models import FieldCondition, Filter, MatchValue + +from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.search.algorithms import ( NextcloudClientProtocol, SearchAlgorithm, SearchResult, - get_indexed_doc_types, ) +from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -45,25 +49,24 @@ class FuzzySearchAlgorithm(SearchAlgorithm): nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: - """Execute fuzzy search using character overlap. + """Execute fuzzy search using character overlap on Qdrant payload. + + Queries Qdrant for all indexed documents, then scores based on character + overlap in title and excerpt fields. Only verifies access with Nextcloud + at the end for security. Args: query: Search query user_id: User ID for filtering limit: Maximum results to return - doc_type: Optional document type filter (currently only "note" supported) - nextcloud_client: NextcloudClient for fetching documents + doc_type: Optional document type filter (None = all types) + nextcloud_client: NextcloudClient for access verification (optional) **kwargs: Additional parameters (threshold override) Returns: List of SearchResult objects ranked by character overlap score - - Raises: - ValueError: If nextcloud_client not provided """ - if not nextcloud_client: - raise ValueError("FuzzySearch requires nextcloud_client parameter") - + settings = get_settings() threshold = kwargs.get("threshold", self.threshold) logger.info( @@ -71,112 +74,176 @@ class FuzzySearchAlgorithm(SearchAlgorithm): f"limit={limit}, threshold={threshold}, doc_type={doc_type}" ) - # Get available document types from Qdrant - indexed_types = await get_indexed_doc_types(user_id) - logger.debug(f"Indexed document types for user: {indexed_types}") - - # Determine which types to search + # Build Qdrant filter + filter_conditions = [ + FieldCondition(key="user_id", match=MatchValue(value=user_id)) + ] if doc_type: - # Search specific type if requested - search_types = [doc_type] if doc_type in indexed_types else [] - if not search_types: - logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") - return [] - else: - # Search all indexed types - search_types = list(indexed_types) + filter_conditions.append( + FieldCondition(key="doc_type", match=MatchValue(value=doc_type)) + ) - # Fetch documents for each type and score them - all_documents = [] - for dtype in search_types: - documents = await self._fetch_documents(nextcloud_client, dtype) - for doc in documents: - doc["_doc_type"] = dtype # Tag with type - all_documents.extend(documents) + # Scroll through Qdrant to get all matching documents + qdrant_client = await get_qdrant_client() + collection = settings.qdrant_collection - logger.debug(f"Fetched {len(all_documents)} total documents for fuzzy search") + all_points = [] + offset = None - # Score and filter documents + # Scroll through all points matching filter + while True: + scroll_result, next_offset = await qdrant_client.scroll( + collection_name=collection, + scroll_filter=Filter(must=filter_conditions), + limit=100, # Batch size + offset=offset, + with_payload=["doc_id", "doc_type", "title", "excerpt", "chunk_index"], + with_vectors=False, # Don't need vectors + ) + + all_points.extend(scroll_result) + + if next_offset is None: + break + offset = next_offset + + logger.debug(f"Retrieved {len(all_points)} points from Qdrant for fuzzy search") + + # Deduplicate by (doc_id, doc_type) - keep first chunk + seen_docs = {} + for point in all_points: + doc_id = int(point.payload["doc_id"]) + dtype = point.payload.get("doc_type", "note") + doc_key = (doc_id, dtype) + + chunk_idx = point.payload.get("chunk_index", 0) + if doc_key not in seen_docs or chunk_idx == 0: + seen_docs[doc_key] = point + + logger.debug(f"Deduplicated to {len(seen_docs)} unique documents") + + # Score each document based on fuzzy matches scored_results = [] query_lower = query.lower() - for doc in all_documents: - dtype = doc.get("_doc_type", "note") - title = doc.get("title", "") - content = doc.get("content", "") + for doc_key, point in seen_docs.items(): + doc_id, dtype = doc_key + title = point.payload.get("title", "") + excerpt = point.payload.get("excerpt", "") # Check title match title_score = self._calculate_char_overlap(query_lower, title.lower()) - # Check content match - content_score = self._calculate_char_overlap(query_lower, content.lower()) + # Check excerpt match + excerpt_score = self._calculate_char_overlap(query_lower, excerpt.lower()) # Use best score - best_score = max(title_score, content_score) + best_score = max(title_score, excerpt_score) if best_score >= threshold: - # Extract excerpt based on which matched better - if title_score >= content_score: - excerpt = f"Title match: {title}" - else: - excerpt = self._extract_excerpt(content, max_length=200) - + match_location = "title" if title_score >= excerpt_score else "excerpt" scored_results.append( - SearchResult( - id=doc["id"], - doc_type=dtype, - title=title or "Untitled", - excerpt=excerpt, - score=best_score, - metadata={ - "category": doc.get("category", ""), - "modified": doc.get("modified"), - "match_location": "title" - if title_score >= content_score - else "content", - }, - ) + { + "doc_id": doc_id, + "doc_type": dtype, + "title": title, + "excerpt": excerpt + if excerpt_score >= title_score + else f"Title match: {title}", + "score": best_score, + "match_location": match_location, + } ) # Sort by score (descending) and limit - scored_results.sort(key=lambda x: x.score, reverse=True) - results = scored_results[:limit] + scored_results.sort(key=lambda x: x["score"], reverse=True) + top_results = scored_results[: limit * 2] # Get extra for access verification - logger.info(f"Fuzzy search returned {len(results)} matching notes") - if results: + # Verify access with Nextcloud (optional, for security) + final_results = [] + if nextcloud_client: + for result in top_results: + verified = await self._verify_access( + nextcloud_client, result["doc_id"], result["doc_type"] + ) + if verified: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={ + **verified.get("metadata", {}), + "match_location": result["match_location"], + }, + ) + ) + if len(final_results) >= limit: + break + else: + # No verification, return results directly + for result in top_results[:limit]: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={"match_location": result["match_location"]}, + ) + ) + + logger.info(f"Fuzzy search returned {len(final_results)} matching documents") + if final_results: result_details = [ - f"note_{r.id} (score={r.score:.3f}, title='{r.title}')" - for r in results[:5] + f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in final_results[:5] ] logger.debug(f"Top fuzzy results: {', '.join(result_details)}") - return results + return final_results - async def _fetch_documents( - self, nextcloud_client: NextcloudClientProtocol, doc_type: str - ) -> list[dict[str, Any]]: - """Fetch documents of a specific type from Nextcloud. + async def _verify_access( + self, nextcloud_client: NextcloudClientProtocol, doc_id: int, doc_type: str + ) -> dict[str, Any] | None: + """Verify user has access to a document via Nextcloud API. Args: nextcloud_client: Client for API access - doc_type: Document type to fetch ("note", "file", "calendar", etc.) + doc_id: Document ID + doc_type: Document type Returns: - List of document dictionaries with at minimum: id, title, content + Dict with metadata if access verified, None otherwise """ - if doc_type == "note": - return await nextcloud_client.notes.get_notes() - elif doc_type == "file": - # Future: fetch files when indexed - logger.info("File documents not yet supported for fuzzy search") - return [] - elif doc_type == "calendar": - # Future: fetch calendar events when indexed - logger.info("Calendar documents not yet supported for fuzzy search") - return [] - else: - logger.warning(f"Unknown document type '{doc_type}' for fuzzy search") - return [] + try: + if doc_type == "note": + note = await nextcloud_client.notes.get_note(doc_id) + return { + "metadata": { + "category": note.get("category", ""), + "modified": note.get("modified"), + } + } + else: + logger.debug( + f"Skipping verification for {doc_type} {doc_id} (not implemented)" + ) + return {"metadata": {}} + except HTTPStatusError as e: + if e.response.status_code in (403, 404): + logger.debug( + f"Access denied for {doc_type} {doc_id}: {e.response.status_code}" + ) + return None + else: + logger.warning( + f"Error verifying {doc_type} {doc_id}: {e.response.status_code}" + ) + return None def _calculate_char_overlap(self, query: str, text: str) -> float: """Calculate character overlap ratio between query and text. diff --git a/nextcloud_mcp_server/search/keyword.py b/nextcloud_mcp_server/search/keyword.py index d4e8002..c98503b 100644 --- a/nextcloud_mcp_server/search/keyword.py +++ b/nextcloud_mcp_server/search/keyword.py @@ -1,14 +1,18 @@ -"""Keyword search algorithm using token-based matching (ADR-001).""" +"""Keyword search algorithm using token-based matching on Qdrant payload (ADR-001).""" import logging from typing import Any +from httpx import HTTPStatusError +from qdrant_client.models import FieldCondition, Filter, MatchValue + +from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.search.algorithms import ( NextcloudClientProtocol, SearchAlgorithm, SearchResult, - get_indexed_doc_types, ) +from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -39,24 +43,24 @@ class KeywordSearchAlgorithm(SearchAlgorithm): nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: - """Execute keyword search using token matching. + """Execute keyword search using token matching on Qdrant payload. + + Queries Qdrant for all indexed documents, then scores based on token + matches in title and excerpt fields. Only verifies access with Nextcloud + at the end for security. Args: query: Search query to tokenize and match user_id: User ID for filtering limit: Maximum results to return - doc_type: Optional document type filter (currently only "note" supported) - nextcloud_client: NextcloudClient for fetching documents + doc_type: Optional document type filter (None = all types) + nextcloud_client: NextcloudClient for access verification (optional) **kwargs: Additional parameters (unused) Returns: List of SearchResult objects ranked by keyword match score - - Raises: - ValueError: If nextcloud_client not provided """ - if not nextcloud_client: - raise ValueError("KeywordSearch requires nextcloud_client parameter") + settings = get_settings() logger.info( f"Keyword search: query='{query}', user={user_id}, " @@ -67,102 +71,173 @@ class KeywordSearchAlgorithm(SearchAlgorithm): query_tokens = self._process_query(query) logger.debug(f"Query tokens: {query_tokens}") - # Get available document types from Qdrant - indexed_types = await get_indexed_doc_types(user_id) - logger.debug(f"Indexed document types for user: {indexed_types}") - - # Determine which types to search + # Build Qdrant filter + filter_conditions = [ + FieldCondition(key="user_id", match=MatchValue(value=user_id)) + ] if doc_type: - # Search specific type if requested - search_types = [doc_type] if doc_type in indexed_types else [] - if not search_types: - logger.info(f"Doc type '{doc_type}' not indexed for user {user_id}") - return [] - else: - # Search all indexed types - search_types = list(indexed_types) - - # Fetch documents for each type and score them - all_documents = [] - for dtype in search_types: - documents = await self._fetch_documents(nextcloud_client, dtype) - for doc in documents: - doc["_doc_type"] = dtype # Tag with type - all_documents.extend(documents) - - logger.debug(f"Fetched {len(all_documents)} total documents for keyword search") - - # Score and filter documents - scored_results = [] - for doc in all_documents: - dtype = doc.get("_doc_type", "note") - score = self._calculate_score( - query_tokens, - doc.get("title", ""), - doc.get("content", ""), + filter_conditions.append( + FieldCondition(key="doc_type", match=MatchValue(value=doc_type)) ) - if score > 0: # Only include matches - # Extract excerpt with context - excerpt = self._extract_excerpt( - doc.get("content", ""), - query_tokens, - max_length=200, - ) + # Scroll through Qdrant to get all matching documents + # We need title and excerpt from payload for token matching + qdrant_client = await get_qdrant_client() + collection = settings.qdrant_collection + all_points = [] + offset = None + + # Scroll through all points matching filter + while True: + scroll_result, next_offset = await qdrant_client.scroll( + collection_name=collection, + scroll_filter=Filter(must=filter_conditions), + limit=100, # Batch size + offset=offset, + with_payload=[ + "doc_id", + "doc_type", + "title", + "excerpt", + "chunk_index", + "total_chunks", + ], + with_vectors=False, # Don't need vectors for keyword search + ) + + all_points.extend(scroll_result) + + if next_offset is None: + break + offset = next_offset + + logger.debug( + f"Retrieved {len(all_points)} points from Qdrant for keyword search" + ) + + # Deduplicate by (doc_id, doc_type) - keep best chunk per document + seen_docs = {} + for point in all_points: + doc_id = int(point.payload["doc_id"]) + dtype = point.payload.get("doc_type", "note") + doc_key = (doc_id, dtype) + + # Keep first chunk (chunk_index=0) as it has the most relevant content + chunk_idx = point.payload.get("chunk_index", 0) + if doc_key not in seen_docs or chunk_idx == 0: + seen_docs[doc_key] = point + + logger.debug(f"Deduplicated to {len(seen_docs)} unique documents") + + # Score each document based on keyword matches + scored_results = [] + for doc_key, point in seen_docs.items(): + doc_id, dtype = doc_key + title = point.payload.get("title", "") + excerpt = point.payload.get("excerpt", "") + + # Calculate keyword match score + score = self._calculate_score(query_tokens, title, excerpt) + + if score > 0: # Only include matches scored_results.append( - SearchResult( - id=doc["id"], - doc_type=dtype, - title=doc.get("title", "Untitled"), - excerpt=excerpt, - score=score, - metadata={ - "category": doc.get("category", ""), - "modified": doc.get("modified"), - }, - ) + { + "doc_id": doc_id, + "doc_type": dtype, + "title": title, + "excerpt": excerpt, + "score": score, + } ) # Sort by score (descending) and limit - scored_results.sort(key=lambda x: x.score, reverse=True) - results = scored_results[:limit] + scored_results.sort(key=lambda x: x["score"], reverse=True) + top_results = scored_results[: limit * 2] # Get extra for access verification - logger.info(f"Keyword search returned {len(results)} matching notes") - if results: + # Verify access with Nextcloud (optional, for security) + final_results = [] + if nextcloud_client: + for result in top_results: + verified = await self._verify_access( + nextcloud_client, result["doc_id"], result["doc_type"] + ) + if verified: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata=verified.get("metadata", {}), + ) + ) + if len(final_results) >= limit: + break + else: + # No verification, return results directly + for result in top_results[:limit]: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={}, + ) + ) + + logger.info(f"Keyword search returned {len(final_results)} matching documents") + if final_results: result_details = [ - f"note_{r.id} (score={r.score:.3f}, title='{r.title}')" - for r in results[:5] + f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in final_results[:5] ] logger.debug(f"Top keyword results: {', '.join(result_details)}") - return results + return final_results - async def _fetch_documents( - self, nextcloud_client: NextcloudClientProtocol, doc_type: str - ) -> list[dict[str, Any]]: - """Fetch documents of a specific type from Nextcloud. + async def _verify_access( + self, nextcloud_client: NextcloudClientProtocol, doc_id: int, doc_type: str + ) -> dict[str, Any] | None: + """Verify user has access to a document via Nextcloud API. Args: nextcloud_client: Client for API access - doc_type: Document type to fetch ("note", "file", "calendar", etc.) + doc_id: Document ID + doc_type: Document type Returns: - List of document dictionaries with at minimum: id, title, content + Dict with metadata if access verified, None otherwise """ - if doc_type == "note": - return await nextcloud_client.notes.get_notes() - elif doc_type == "file": - # Future: fetch files when indexed - logger.info("File documents not yet supported for keyword search") - return [] - elif doc_type == "calendar": - # Future: fetch calendar events when indexed - logger.info("Calendar documents not yet supported for keyword search") - return [] - else: - logger.warning(f"Unknown document type '{doc_type}' for keyword search") - return [] + try: + if doc_type == "note": + note = await nextcloud_client.notes.get_note(doc_id) + return { + "metadata": { + "category": note.get("category", ""), + "modified": note.get("modified"), + } + } + # Future: Add verification for other document types + else: + logger.debug( + f"Skipping verification for {doc_type} {doc_id} (not implemented)" + ) + return {"metadata": {}} + except HTTPStatusError as e: + if e.response.status_code in (403, 404): + logger.debug( + f"Access denied for {doc_type} {doc_id}: {e.response.status_code}" + ) + return None + else: + logger.warning( + f"Error verifying {doc_type} {doc_id}: {e.response.status_code}" + ) + return None def _process_query(self, query: str) -> list[str]: """Tokenize and normalize query. diff --git a/nextcloud_mcp_server/server/semantic.py b/nextcloud_mcp_server/server/semantic.py index d0a3109..d000b11 100644 --- a/nextcloud_mcp_server/server/semantic.py +++ b/nextcloud_mcp_server/server/semantic.py @@ -45,6 +45,7 @@ def configure_semantic_tools(mcp: FastMCP): query: str, ctx: Context, limit: int = 10, + doc_types: list[str] | None = None, score_threshold: float = 0.7, algorithm: Literal["semantic", "keyword", "fuzzy", "hybrid"] = "hybrid", semantic_weight: float = 0.5, @@ -52,7 +53,7 @@ def configure_semantic_tools(mcp: FastMCP): fuzzy_weight: float = 0.2, ) -> SemanticSearchResponse: """ - Search Nextcloud content using configurable algorithms. + Search Nextcloud content using configurable algorithms with cross-app support. Supports multiple search algorithms with client-configurable weighting: - semantic: Vector similarity search (requires VECTOR_SYNC_ENABLED=true) @@ -60,9 +61,13 @@ def configure_semantic_tools(mcp: FastMCP): - fuzzy: Character overlap matching (typo-tolerant) - hybrid: Combines all algorithms using Reciprocal Rank Fusion (default) + Document types are queried from the vector database to determine what's + actually indexed. Currently only "note" documents are fully supported. + Args: query: Natural language search query limit: Maximum number of results to return (default: 10) + doc_types: Document types to search (e.g., ["note", "file"]). None = search all indexed types (default) score_threshold: Minimum similarity score for semantic/hybrid (0-1, default: 0.7) algorithm: Search algorithm to use (default: "hybrid") semantic_weight: Weight for semantic results in hybrid mode (default: 0.5) @@ -116,15 +121,42 @@ def configure_semantic_tools(mcp: FastMCP): ErrorData(code=-1, message=f"Unknown algorithm: {algorithm}") ) - # Execute search (currently limited to notes doc_type) - search_results = await search_algo.search( - query=query, - user_id=username, - limit=limit, - doc_type="note", - nextcloud_client=client, - score_threshold=score_threshold, - ) + # Execute search across requested document types + # If doc_types is None, search all indexed types (cross-app search) + # If doc_types is a list, search only those types + all_results = [] + + if doc_types is None: + # Cross-app search: search all indexed types + # Pass None to search algorithm to let it query Qdrant for available types + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit, + doc_type=None, # Signal to search all types + nextcloud_client=client, + score_threshold=score_threshold, + ) + all_results.extend(search_results) + else: + # Search specific document types + # For each requested type, execute search and combine results + for dtype in doc_types: + search_results = await search_algo.search( + query=query, + user_id=username, + limit=limit * 2, # Get extra for combining + doc_type=dtype, + nextcloud_client=client, + score_threshold=score_threshold, + ) + all_results.extend(search_results) + + # Sort combined results by score and limit + all_results.sort(key=lambda r: r.score, reverse=True) + all_results = all_results[:limit] + + search_results = all_results # Convert SearchResult objects to SemanticSearchResult for response results = []