diff --git a/nextcloud_mcp_server/auth/viz_routes.py b/nextcloud_mcp_server/auth/viz_routes.py index f68fd53..20eaf1c 100644 --- a/nextcloud_mcp_server/auth/viz_routes.py +++ b/nextcloud_mcp_server/auth/viz_routes.py @@ -450,43 +450,39 @@ async def vector_visualization_search(request: Request) -> JSONResponse: ) # Execute search (supports cross-app when doc_types=None) + # Get unverified results with buffer for filtering + all_results = [] if doc_types is None or len(doc_types) == 0: # Cross-app search - search all indexed types - search_results = await search_algo.search( + unverified_results = await search_algo.search( query=query, user_id=username, - limit=limit, + limit=limit * 2, # Buffer for verification filtering doc_type=None, # Search all types - nextcloud_client=nextcloud_client, - score_threshold=score_threshold, - ) - elif len(doc_types) == 1: - # Single document type - search_results = await search_algo.search( - query=query, - user_id=username, - limit=limit, - doc_type=doc_types[0], - nextcloud_client=nextcloud_client, score_threshold=score_threshold, ) + all_results.extend(unverified_results) else: - # Multiple document types - search each and combine - all_results = [] + # Search each document type and combine for doc_type in doc_types: - results = await search_algo.search( + unverified_results = await search_algo.search( query=query, user_id=username, - limit=limit * 2, # Get extra per type + limit=limit * 2, # Buffer for verification filtering doc_type=doc_type, - nextcloud_client=nextcloud_client, score_threshold=score_threshold, ) - all_results.extend(results) - - # Sort by score and limit + all_results.extend(unverified_results) + # Sort by score before verification all_results.sort(key=lambda r: r.score, reverse=True) - search_results = all_results[:limit] + + # Verify access for all results (deduplicates and filters) + from nextcloud_mcp_server.search.verification import verify_search_results + + verified_results = await verify_search_results( + all_results, nextcloud_client + ) + search_results = verified_results[:limit] if not search_results: return JSONResponse( diff --git a/nextcloud_mcp_server/search/fuzzy.py b/nextcloud_mcp_server/search/fuzzy.py index 9cca22e..ff7a6b2 100644 --- a/nextcloud_mcp_server/search/fuzzy.py +++ b/nextcloud_mcp_server/search/fuzzy.py @@ -3,15 +3,10 @@ import logging from typing import Any -from httpx import HTTPStatusError from qdrant_client.models import FieldCondition, Filter, MatchValue from nextcloud_mcp_server.config import get_settings -from nextcloud_mcp_server.search.algorithms import ( - NextcloudClientProtocol, - SearchAlgorithm, - SearchResult, -) +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -46,25 +41,23 @@ class FuzzySearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute fuzzy search using character overlap on Qdrant payload. Queries Qdrant for all indexed documents, then scores based on character - overlap in title and excerpt fields. Only verifies access with Nextcloud - at the end for security. + overlap in title and excerpt fields. Returns unverified results - access + verification should be performed separately at the final output stage. Args: query: Search query user_id: User ID for filtering limit: Maximum results to return doc_type: Optional document type filter (None = all types) - nextcloud_client: NextcloudClient for access verification (optional) **kwargs: Additional parameters (threshold override) Returns: - List of SearchResult objects ranked by character overlap score + List of unverified SearchResult objects ranked by character overlap score """ settings = get_settings() threshold = kwargs.get("threshold", self.threshold) @@ -157,66 +150,23 @@ class FuzzySearchAlgorithm(SearchAlgorithm): # Sort by score (descending) and limit scored_results.sort(key=lambda x: x["score"], reverse=True) - top_results = scored_results[: limit * 2] # Get extra for access verification + top_results = scored_results[:limit] - # Verify access with Nextcloud (optional, for security) - # Parallelize verification to avoid sequential HTTP calls + # Return unverified results (verification happens at output stage) final_results = [] - if nextcloud_client: - from asyncio import gather - - # Create verification coroutines for all top results - verification_coros = [ - self._verify_access( - nextcloud_client, result["doc_id"], result["doc_type"] + for result in top_results: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={"match_location": result["match_location"]}, ) - for result in top_results - ] - - # Execute all verifications in parallel - # return_exceptions=True prevents one failure from canceling others - verification_results = await gather( - *verification_coros, return_exceptions=True ) - # Build final results from verified documents - for result, verified in zip(top_results, verification_results): - # Skip if verification failed or raised exception - if isinstance(verified, Exception) or verified is None: - continue - - final_results.append( - SearchResult( - id=result["doc_id"], - doc_type=result["doc_type"], - title=result["title"], - excerpt=result["excerpt"], - score=result["score"], - metadata={ - **verified.get("metadata", {}), - "match_location": result["match_location"], - }, - ) - ) - - # Stop once we have enough results - if len(final_results) >= limit: - break - else: - # No verification, return results directly - for result in top_results[:limit]: - final_results.append( - SearchResult( - id=result["doc_id"], - doc_type=result["doc_type"], - title=result["title"], - excerpt=result["excerpt"], - score=result["score"], - metadata={"match_location": result["match_location"]}, - ) - ) - - logger.info(f"Fuzzy search returned {len(final_results)} matching documents") + logger.info(f"Fuzzy search returned {len(final_results)} unverified results") if final_results: result_details = [ f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" @@ -226,45 +176,6 @@ class FuzzySearchAlgorithm(SearchAlgorithm): return final_results - async def _verify_access( - self, nextcloud_client: NextcloudClientProtocol, doc_id: int, doc_type: str - ) -> dict[str, Any] | None: - """Verify user has access to a document via Nextcloud API. - - Args: - nextcloud_client: Client for API access - doc_id: Document ID - doc_type: Document type - - Returns: - Dict with metadata if access verified, None otherwise - """ - try: - if doc_type == "note": - note = await nextcloud_client.notes.get_note(doc_id) - return { - "metadata": { - "category": note.get("category", ""), - "modified": note.get("modified"), - } - } - else: - logger.debug( - f"Skipping verification for {doc_type} {doc_id} (not implemented)" - ) - return {"metadata": {}} - except HTTPStatusError as e: - if e.response.status_code in (403, 404): - logger.debug( - f"Access denied for {doc_type} {doc_id}: {e.response.status_code}" - ) - return None - else: - logger.warning( - f"Error verifying {doc_type} {doc_id}: {e.response.status_code}" - ) - return None - def _calculate_char_overlap(self, query: str, text: str) -> float: """Calculate character overlap ratio between query and text. diff --git a/nextcloud_mcp_server/search/hybrid.py b/nextcloud_mcp_server/search/hybrid.py index 947f1f6..f7c3849 100644 --- a/nextcloud_mcp_server/search/hybrid.py +++ b/nextcloud_mcp_server/search/hybrid.py @@ -5,11 +5,7 @@ import logging from collections import defaultdict from typing import Any -from nextcloud_mcp_server.search.algorithms import ( - NextcloudClientProtocol, - SearchAlgorithm, - SearchResult, -) +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult from nextcloud_mcp_server.search.fuzzy import FuzzySearchAlgorithm from nextcloud_mcp_server.search.keyword import KeywordSearchAlgorithm from nextcloud_mcp_server.search.semantic import SemanticSearchAlgorithm @@ -85,24 +81,22 @@ class HybridSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute hybrid search using RRF to combine algorithms. + Returns unverified results from combined algorithms. Access verification + should be performed separately at the final output stage. + Args: query: Search query user_id: User ID for filtering limit: Maximum results to return doc_type: Optional document type filter - nextcloud_client: NextcloudClient for document access **kwargs: Additional parameters passed to sub-algorithms Returns: - List of SearchResult objects ranked by RRF combined score - - Raises: - ValueError: If nextcloud_client not provided (needed for keyword/fuzzy) + List of unverified SearchResult objects ranked by RRF combined score """ logger.info( f"Hybrid search: query='{query}', user={user_id}, limit={limit}, " @@ -116,29 +110,19 @@ class HybridSearchAlgorithm(SearchAlgorithm): if self.semantic_weight > 0: tasks.append( - self.semantic.search( - query, user_id, limit * 2, doc_type, nextcloud_client, **kwargs - ) + self.semantic.search(query, user_id, limit * 2, doc_type, **kwargs) ) algo_names.append("semantic") if self.keyword_weight > 0: - if not nextcloud_client: - raise ValueError("Hybrid search with keyword requires nextcloud_client") tasks.append( - self.keyword.search( - query, user_id, limit * 2, doc_type, nextcloud_client, **kwargs - ) + self.keyword.search(query, user_id, limit * 2, doc_type, **kwargs) ) algo_names.append("keyword") if self.fuzzy_weight > 0: - if not nextcloud_client: - raise ValueError("Hybrid search with fuzzy requires nextcloud_client") tasks.append( - self.fuzzy.search( - query, user_id, limit * 2, doc_type, nextcloud_client, **kwargs - ) + self.fuzzy.search(query, user_id, limit * 2, doc_type, **kwargs) ) algo_names.append("fuzzy") diff --git a/nextcloud_mcp_server/search/keyword.py b/nextcloud_mcp_server/search/keyword.py index 3f24295..6028478 100644 --- a/nextcloud_mcp_server/search/keyword.py +++ b/nextcloud_mcp_server/search/keyword.py @@ -3,15 +3,10 @@ import logging from typing import Any -from httpx import HTTPStatusError from qdrant_client.models import FieldCondition, Filter, MatchValue from nextcloud_mcp_server.config import get_settings -from nextcloud_mcp_server.search.algorithms import ( - NextcloudClientProtocol, - SearchAlgorithm, - SearchResult, -) +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -40,25 +35,23 @@ class KeywordSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute keyword search using token matching on Qdrant payload. Queries Qdrant for all indexed documents, then scores based on token - matches in title and excerpt fields. Only verifies access with Nextcloud - at the end for security. + matches in title and excerpt fields. Returns unverified results - access + verification should be performed separately at the final output stage. Args: query: Search query to tokenize and match user_id: User ID for filtering limit: Maximum results to return doc_type: Optional document type filter (None = all types) - nextcloud_client: NextcloudClient for access verification (optional) **kwargs: Additional parameters (unused) Returns: - List of SearchResult objects ranked by keyword match score + List of unverified SearchResult objects ranked by keyword match score """ settings = get_settings() @@ -153,63 +146,23 @@ class KeywordSearchAlgorithm(SearchAlgorithm): # Sort by score (descending) and limit scored_results.sort(key=lambda x: x["score"], reverse=True) - top_results = scored_results[: limit * 2] # Get extra for access verification + top_results = scored_results[:limit] - # Verify access with Nextcloud (optional, for security) - # Parallelize verification to avoid sequential HTTP calls + # Return unverified results (verification happens at output stage) final_results = [] - if nextcloud_client: - from asyncio import gather - - # Create verification coroutines for all top results - verification_coros = [ - self._verify_access( - nextcloud_client, result["doc_id"], result["doc_type"] + for result in top_results: + final_results.append( + SearchResult( + id=result["doc_id"], + doc_type=result["doc_type"], + title=result["title"], + excerpt=result["excerpt"], + score=result["score"], + metadata={}, ) - for result in top_results - ] - - # Execute all verifications in parallel - # return_exceptions=True prevents one failure from canceling others - verification_results = await gather( - *verification_coros, return_exceptions=True ) - # Build final results from verified documents - for result, verified in zip(top_results, verification_results): - # Skip if verification failed or raised exception - if isinstance(verified, Exception) or verified is None: - continue - - final_results.append( - SearchResult( - id=result["doc_id"], - doc_type=result["doc_type"], - title=result["title"], - excerpt=result["excerpt"], - score=result["score"], - metadata=verified.get("metadata", {}), - ) - ) - - # Stop once we have enough results - if len(final_results) >= limit: - break - else: - # No verification, return results directly - for result in top_results[:limit]: - final_results.append( - SearchResult( - id=result["doc_id"], - doc_type=result["doc_type"], - title=result["title"], - excerpt=result["excerpt"], - score=result["score"], - metadata={}, - ) - ) - - logger.info(f"Keyword search returned {len(final_results)} matching documents") + logger.info(f"Keyword search returned {len(final_results)} unverified results") if final_results: result_details = [ f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" @@ -219,46 +172,6 @@ class KeywordSearchAlgorithm(SearchAlgorithm): return final_results - async def _verify_access( - self, nextcloud_client: NextcloudClientProtocol, doc_id: int, doc_type: str - ) -> dict[str, Any] | None: - """Verify user has access to a document via Nextcloud API. - - Args: - nextcloud_client: Client for API access - doc_id: Document ID - doc_type: Document type - - Returns: - Dict with metadata if access verified, None otherwise - """ - try: - if doc_type == "note": - note = await nextcloud_client.notes.get_note(doc_id) - return { - "metadata": { - "category": note.get("category", ""), - "modified": note.get("modified"), - } - } - # Future: Add verification for other document types - else: - logger.debug( - f"Skipping verification for {doc_type} {doc_id} (not implemented)" - ) - return {"metadata": {}} - except HTTPStatusError as e: - if e.response.status_code in (403, 404): - logger.debug( - f"Access denied for {doc_type} {doc_id}: {e.response.status_code}" - ) - return None - else: - logger.warning( - f"Error verifying {doc_type} {doc_id}: {e.response.status_code}" - ) - return None - def _process_query(self, query: str) -> list[str]: """Tokenize and normalize query. diff --git a/nextcloud_mcp_server/search/semantic.py b/nextcloud_mcp_server/search/semantic.py index e38b16d..d770bf7 100644 --- a/nextcloud_mcp_server/search/semantic.py +++ b/nextcloud_mcp_server/search/semantic.py @@ -3,17 +3,12 @@ import logging from typing import Any -from httpx import HTTPStatusError from qdrant_client.models import FieldCondition, Filter, MatchValue from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.embedding import get_embedding_service from nextcloud_mcp_server.observability.metrics import record_qdrant_operation -from nextcloud_mcp_server.search.algorithms import ( - NextcloudClientProtocol, - SearchAlgorithm, - SearchResult, -) +from nextcloud_mcp_server.search.algorithms import SearchAlgorithm, SearchResult from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client logger = logging.getLogger(__name__) @@ -48,21 +43,22 @@ class SemanticSearchAlgorithm(SearchAlgorithm): user_id: str, limit: int = 10, doc_type: str | None = None, - nextcloud_client: NextcloudClientProtocol | None = None, **kwargs: Any, ) -> list[SearchResult]: """Execute semantic search using vector similarity. + Returns unverified results from Qdrant. Access verification should be + performed separately at the final output stage using verify_search_results(). + Args: query: Natural language search query user_id: User ID for filtering limit: Maximum results to return - doc_type: Optional document type filter (currently only "note" supported) - nextcloud_client: NextcloudClient for access verification + doc_type: Optional document type filter **kwargs: Additional parameters (score_threshold override) Returns: - List of SearchResult objects ranked by similarity score + List of unverified SearchResult objects ranked by similarity score Raises: McpError: If vector sync is not enabled or search fails @@ -118,7 +114,7 @@ class SemanticSearchAlgorithm(SearchAlgorithm): logger.info( f"Qdrant returned {len(search_response.points)} results " - f"(before deduplication and access verification)" + f"(before deduplication)" ) if search_response.points: @@ -126,47 +122,11 @@ class SemanticSearchAlgorithm(SearchAlgorithm): top_scores = [p.score for p in search_response.points[:3]] logger.debug(f"Top 3 similarity scores: {top_scores}") - # Deduplicate by document ID (multiple chunks per document) - results = await self._deduplicate_and_verify( - search_response.points, limit, nextcloud_client - ) - - logger.info( - f"Returning {len(results)} results after deduplication and access verification" - ) - if results: - result_details = [ - f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" - for r in results[:5] # Show top 5 - ] - logger.debug(f"Top results: {', '.join(result_details)}") - - return results - - async def _deduplicate_and_verify( - self, - points: list[Any], - limit: int, - nextcloud_client: NextcloudClientProtocol | None, - ) -> list[SearchResult]: - """Deduplicate results by (doc_id, doc_type) and verify access. - - Supports multiple document types with dispatch to appropriate client methods. - Deduplication is now by (doc_id, doc_type) tuple to handle cases where - the same ID might exist across different document types. - - Args: - points: Qdrant search results - limit: Maximum results to return - nextcloud_client: NextcloudClient for access verification (optional) - - Returns: - List of SearchResult objects - """ - seen_docs = set() # Track (doc_id, doc_type) tuples + # Deduplicate by (doc_id, doc_type) - multiple chunks per document + seen_docs = set() results = [] - for result in points: + for result in search_response.points: doc_id = int(result.payload["doc_id"]) doc_type = result.payload.get("doc_type", "note") doc_key = (doc_id, doc_type) @@ -177,99 +137,30 @@ class SemanticSearchAlgorithm(SearchAlgorithm): seen_docs.add(doc_key) - # Verify access via Nextcloud API if client provided - # Dispatch to appropriate client based on doc_type - verified_result = None - - if nextcloud_client: - verified_result = await self._verify_document_access( - nextcloud_client, doc_id, doc_type, result - ) - - if verified_result: - results.append(verified_result) - elif not nextcloud_client: - # No access verification, return result directly - results.append( - SearchResult( - id=doc_id, - doc_type=doc_type, - title=result.payload["title"], - excerpt=result.payload["excerpt"], - score=result.score, - metadata={ - "chunk_index": result.payload.get("chunk_index"), - "total_chunks": result.payload.get("total_chunks"), - }, - ) + # Return unverified results (verification happens at output stage) + results.append( + SearchResult( + id=doc_id, + doc_type=doc_type, + title=result.payload.get("title", "Untitled"), + excerpt=result.payload.get("excerpt", ""), + score=result.score, + metadata={ + "chunk_index": result.payload.get("chunk_index"), + "total_chunks": result.payload.get("total_chunks"), + }, ) + ) if len(results) >= limit: break + logger.info(f"Returning {len(results)} unverified results after deduplication") + if results: + result_details = [ + f"{r.doc_type}_{r.id} (score={r.score:.3f}, title='{r.title}')" + for r in results[:5] # Show top 5 + ] + logger.debug(f"Top results: {', '.join(result_details)}") + return results - - async def _verify_document_access( - self, - nextcloud_client: NextcloudClientProtocol, - doc_id: int, - doc_type: str, - qdrant_result: Any, - ) -> SearchResult | None: - """Verify user has access to a document via Nextcloud API. - - Dispatches to appropriate client method based on document type. - - Args: - nextcloud_client: Client for API access - doc_id: Document ID - doc_type: Document type ("note", "file", "calendar", etc.) - qdrant_result: Original Qdrant search result - - Returns: - SearchResult if access verified, None if access denied or error - """ - try: - if doc_type == "note": - note = await nextcloud_client.notes.get_note(doc_id) - return SearchResult( - id=doc_id, - doc_type="note", - title=qdrant_result.payload["title"], - excerpt=qdrant_result.payload["excerpt"], - score=qdrant_result.score, - metadata={ - "category": note.get("category", ""), - "chunk_index": qdrant_result.payload["chunk_index"], - "total_chunks": qdrant_result.payload["total_chunks"], - }, - ) - elif doc_type == "file": - # Future: verify file access when files are indexed - logger.info( - f"File {doc_id} found in search but file verification not yet implemented" - ) - return None - elif doc_type == "calendar": - # Future: verify calendar access when calendar events are indexed - logger.info( - f"Calendar event {doc_id} found in search but calendar verification not yet implemented" - ) - return None - else: - logger.warning( - f"Unknown document type '{doc_type}' for doc_id {doc_id}" - ) - return None - - except HTTPStatusError as e: - if e.response.status_code in (403, 404): - # User lost access or document deleted - logger.debug(f"Skipping {doc_type} {doc_id}: {e.response.status_code}") - return None - else: - # Log other errors but continue processing - logger.warning( - f"Error verifying access to {doc_type} {doc_id}: {e.response.status_code}" - ) - return None diff --git a/nextcloud_mcp_server/search/verification.py b/nextcloud_mcp_server/search/verification.py new file mode 100644 index 0000000..ea64621 --- /dev/null +++ b/nextcloud_mcp_server/search/verification.py @@ -0,0 +1,122 @@ +"""Access verification for search results. + +This module provides centralized verification of Nextcloud access permissions +for search results. Verification happens at the final output stage (MCP tool/viz endpoint) +rather than within individual search algorithms, preventing redundant API calls. + +Key benefits: +- Deduplication: Each document verified exactly once (even in hybrid mode) +- Parallel execution: All verifications run concurrently via anyio task groups +- Separation of concerns: Algorithms handle scoring, this module handles security +""" + +import logging +from dataclasses import replace +from typing import Protocol + +import anyio + +from nextcloud_mcp_server.search.algorithms import SearchResult + +logger = logging.getLogger(__name__) + + +class NextcloudClientProtocol(Protocol): + """Protocol for Nextcloud client with app-specific access.""" + + @property + def notes(self): + """Notes client for accessing notes API.""" + ... + + +async def verify_search_results( + results: list[SearchResult], + nextcloud_client: NextcloudClientProtocol, +) -> list[SearchResult]: + """ + Verify Nextcloud access for search results. + + Deduplicates by (doc_id, doc_type), verifies in parallel using anyio task groups, + and filters out inaccessible documents. Maintains original result ordering. + + Args: + results: Unverified search results from Qdrant + nextcloud_client: Nextcloud client for access checks + + Returns: + Verified and accessible results (same order as input) + + Example: + >>> unverified = await search_algo.search(query="test", limit=10) + >>> verified = await verify_search_results(unverified, client) + >>> # verified contains only documents user can access + """ + # Deduplicate by (doc_id, doc_type) while preserving order + # This is critical for hybrid search where same doc may appear in multiple algorithm results + seen = set() + unique_results = [] + for result in results: + key = (result.id, result.doc_type) + if key not in seen: + seen.add(key) + unique_results.append(result) + + if not unique_results: + return [] + + logger.debug( + f"Verifying access for {len(unique_results)} unique documents " + f"(from {len(results)} total results)" + ) + + # Verify all unique documents in parallel using anyio task group + # Use list to maintain order (index-based storage) + verified_results = [None] * len(unique_results) + + async def verify_one(index: int, result: SearchResult): + """ + Verify a single document and store result at index. + + Args: + index: Position in verified_results list + result: Search result to verify + """ + try: + if result.doc_type == "note": + # Fetch note to verify access and get fresh metadata + note = await nextcloud_client.notes.get_note(result.id) + # Update metadata with fresh data from Nextcloud + updated_metadata = {**(result.metadata or {}), **note} + verified_results[index] = replace(result, metadata=updated_metadata) + # TODO: Add verification for other doc types (calendar, deck, file, etc.) + else: + # For now, assume other types are accessible + # In production, add proper verification for each type + logger.debug( + f"No verification implemented for doc_type={result.doc_type}, " + "assuming accessible" + ) + verified_results[index] = result + + except Exception as e: + # Document is inaccessible (403, 404, or other error) + # Log at debug level since this is expected for filtered results + logger.debug(f"Document {result.doc_type}/{result.id} not accessible: {e}") + verified_results[index] = None + + # Run all verifications in parallel using anyio task group + # This provides structured concurrency with automatic cancellation on errors + async with anyio.create_task_group() as tg: + for idx, result in enumerate(unique_results): + tg.start_soon(verify_one, idx, result) + + # Filter out None (inaccessible) and return verified results + accessible = [r for r in verified_results if r is not None] + + logger.debug( + f"Verification complete: {len(accessible)} accessible, " + f"{len(unique_results) - len(accessible)} filtered out" + ) + + return accessible diff --git a/nextcloud_mcp_server/server/semantic.py b/nextcloud_mcp_server/server/semantic.py index d000b11..fe13c1e 100644 --- a/nextcloud_mcp_server/server/semantic.py +++ b/nextcloud_mcp_server/server/semantic.py @@ -128,35 +128,36 @@ def configure_semantic_tools(mcp: FastMCP): if doc_types is None: # Cross-app search: search all indexed types - # Pass None to search algorithm to let it query Qdrant for available types - search_results = await search_algo.search( + # Get unverified results from Qdrant + unverified_results = await search_algo.search( query=query, user_id=username, - limit=limit, + limit=limit * 2, # Get extra for access filtering doc_type=None, # Signal to search all types - nextcloud_client=client, score_threshold=score_threshold, ) - all_results.extend(search_results) + all_results.extend(unverified_results) else: # Search specific document types # For each requested type, execute search and combine results for dtype in doc_types: - search_results = await search_algo.search( + unverified_results = await search_algo.search( query=query, user_id=username, - limit=limit * 2, # Get extra for combining + limit=limit * 2, # Get extra for combining and filtering doc_type=dtype, - nextcloud_client=client, score_threshold=score_threshold, ) - all_results.extend(search_results) + all_results.extend(unverified_results) - # Sort combined results by score and limit + # Sort combined results by score all_results.sort(key=lambda r: r.score, reverse=True) - all_results = all_results[:limit] - search_results = all_results + # Verify access for all results (deduplicates and filters) + from nextcloud_mcp_server.search.verification import verify_search_results + + verified_results = await verify_search_results(all_results, client) + search_results = verified_results[:limit] # Final limit after verification # Convert SearchResult objects to SemanticSearchResult for response results = []