42376483ab
Move access verification from individual search algorithms to final output stage, eliminating redundant API calls and improving performance. ## Changes **New:** - `search/verification.py`: Centralized verification using anyio task groups - Deduplicates results by (doc_id, doc_type) before verification - Verifies all unique documents in parallel using structured concurrency - Filters out inaccessible documents in single pass **Modified Search Algorithms:** - `search/semantic.py`: Removed _deduplicate_and_verify() and _verify_document_access() - `search/keyword.py`: Removed _verify_access() and parallel verification - `search/fuzzy.py`: Removed _verify_access() and parallel verification - `search/hybrid.py`: Removed nextcloud_client parameter passing All algorithms now return unverified results from Qdrant payload. **Modified Output Stages:** - `server/semantic.py`: Added verify_search_results() call after search - `auth/viz_routes.py`: Added verify_search_results() call after search Both endpoints now verify access once at final stage with deduplication. ## Performance Impact **Before:** - Hybrid mode (limit=10): 30 API calls (10 per algorithm × 3 algorithms) - Single algorithm: 10-20 API calls (with verification buffer) **After:** - Hybrid mode (limit=10): 10 API calls (deduplicated verification) - Single algorithm: 10 API calls (deduplicated verification) **Performance Gain:** 3x reduction in API calls for hybrid search ## Architecture Benefits - **Separation of concerns**: Algorithms handle scoring, output stage handles security - **Deduplication**: Each document verified exactly once - **Parallel execution**: All verifications run concurrently via anyio task groups - **Consistency**: Same verification logic across MCP tools and viz endpoints 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
611 lines
24 KiB
Python
611 lines
24 KiB
Python
"""Vector visualization routes for testing search algorithms.
|
|
|
|
Provides a web UI for users to test different search algorithms on their own
|
|
indexed documents and visualize results in 2D space using PCA.
|
|
|
|
All processing happens server-side following ADR-012:
|
|
- Search execution via shared search/algorithms.py
|
|
- PCA dimensionality reduction (768-dim → 2D)
|
|
- Only 2D coordinates + metadata sent to client
|
|
- Bandwidth-efficient (2 floats per doc vs 768)
|
|
"""
|
|
|
|
import logging
|
|
|
|
import numpy as np
|
|
from starlette.authentication import requires
|
|
from starlette.requests import Request
|
|
from starlette.responses import HTMLResponse, JSONResponse
|
|
|
|
from nextcloud_mcp_server.config import get_settings
|
|
from nextcloud_mcp_server.search import (
|
|
FuzzySearchAlgorithm,
|
|
HybridSearchAlgorithm,
|
|
KeywordSearchAlgorithm,
|
|
SemanticSearchAlgorithm,
|
|
)
|
|
from nextcloud_mcp_server.vector.pca import PCA
|
|
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
@requires("authenticated", redirect="oauth_login")
|
|
async def vector_visualization_html(request: Request) -> HTMLResponse:
|
|
"""Vector visualization page with search controls and interactive plot.
|
|
|
|
Provides UI for testing search algorithms with real-time visualization.
|
|
Requires vector sync to be enabled.
|
|
|
|
Args:
|
|
request: Starlette request object
|
|
|
|
Returns:
|
|
HTML page with search interface
|
|
"""
|
|
settings = get_settings()
|
|
|
|
if not settings.vector_sync_enabled:
|
|
return HTMLResponse(
|
|
"""
|
|
<div>
|
|
<h2>Vector Visualization</h2>
|
|
<div style="padding: 20px; background: #fff3cd; border: 1px solid #ffc107; border-radius: 4px;">
|
|
Vector sync is not enabled. Set VECTOR_SYNC_ENABLED=true to use this feature.
|
|
</div>
|
|
</div>
|
|
"""
|
|
)
|
|
|
|
# Get user info from auth context
|
|
username = (
|
|
request.user.display_name
|
|
if hasattr(request.user, "display_name")
|
|
else "unknown"
|
|
)
|
|
|
|
html_content = f"""
|
|
<style>
|
|
.viz-card {{
|
|
background: white;
|
|
border-radius: 8px;
|
|
padding: 20px;
|
|
margin-bottom: 20px;
|
|
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
|
}}
|
|
.viz-controls {{
|
|
margin-bottom: 20px;
|
|
}}
|
|
.viz-control-row {{
|
|
display: grid;
|
|
grid-template-columns: 2fr 1fr auto;
|
|
gap: 12px;
|
|
margin-bottom: 12px;
|
|
align-items: end;
|
|
}}
|
|
.viz-control-group {{
|
|
margin-bottom: 15px;
|
|
}}
|
|
.viz-control-group label {{
|
|
display: block;
|
|
margin-bottom: 5px;
|
|
font-weight: 500;
|
|
color: #333;
|
|
}}
|
|
.viz-control-group input[type="text"],
|
|
.viz-control-group input[type="number"],
|
|
.viz-control-group select {{
|
|
width: 100%;
|
|
padding: 8px 12px;
|
|
border: 1px solid #ddd;
|
|
border-radius: 4px;
|
|
font-size: 14px;
|
|
}}
|
|
.viz-control-group input[type="range"] {{
|
|
width: 100%;
|
|
}}
|
|
.viz-control-group select[multiple] {{
|
|
min-height: 100px;
|
|
}}
|
|
.viz-weight-display {{
|
|
display: inline-block;
|
|
min-width: 40px;
|
|
text-align: right;
|
|
color: #666;
|
|
}}
|
|
.viz-btn {{
|
|
background: #0066cc;
|
|
color: white;
|
|
border: none;
|
|
padding: 10px 20px;
|
|
border-radius: 4px;
|
|
cursor: pointer;
|
|
font-size: 14px;
|
|
font-weight: 500;
|
|
}}
|
|
.viz-btn:hover {{
|
|
background: #0052a3;
|
|
}}
|
|
.viz-btn-secondary {{
|
|
background: #6c757d;
|
|
color: white;
|
|
border: none;
|
|
padding: 6px 12px;
|
|
border-radius: 4px;
|
|
cursor: pointer;
|
|
font-size: 13px;
|
|
margin-bottom: 12px;
|
|
}}
|
|
.viz-btn-secondary:hover {{
|
|
background: #5a6268;
|
|
}}
|
|
#viz-plot-container {{
|
|
width: 100%;
|
|
height: 600px;
|
|
position: relative;
|
|
}}
|
|
#viz-plot {{
|
|
width: 100%;
|
|
height: 100%;
|
|
}}
|
|
.viz-loading {{
|
|
text-align: center;
|
|
padding: 40px;
|
|
color: #666;
|
|
}}
|
|
.viz-loading-overlay {{
|
|
position: absolute;
|
|
inset: 0;
|
|
display: flex;
|
|
align-items: center;
|
|
justify-content: center;
|
|
background: white;
|
|
color: #666;
|
|
}}
|
|
.viz-no-results {{
|
|
text-align: center;
|
|
padding: 40px;
|
|
color: #666;
|
|
font-style: italic;
|
|
}}
|
|
.viz-advanced-section {{
|
|
margin-top: 16px;
|
|
padding: 16px;
|
|
background: #f8f9fa;
|
|
border-radius: 4px;
|
|
border: 1px solid #dee2e6;
|
|
}}
|
|
.viz-advanced-grid {{
|
|
display: grid;
|
|
grid-template-columns: 1fr 1fr;
|
|
gap: 20px;
|
|
}}
|
|
.viz-info-box {{
|
|
background: #e3f2fd;
|
|
border-left: 4px solid #2196f3;
|
|
padding: 12px;
|
|
margin-bottom: 20px;
|
|
font-size: 14px;
|
|
}}
|
|
</style>
|
|
|
|
<div x-data="vizApp()">
|
|
<div class="viz-card">
|
|
<h2>Vector Visualization</h2>
|
|
<div class="viz-info-box">
|
|
Testing search algorithms on your indexed documents. User: <strong>{username}</strong>
|
|
</div>
|
|
|
|
<form @submit.prevent="executeSearch">
|
|
<div class="viz-controls">
|
|
<!-- Main Controls -->
|
|
<div class="viz-control-group">
|
|
<label>Search Query</label>
|
|
<input type="text" x-model="query" placeholder="Enter search query..." required />
|
|
</div>
|
|
|
|
<div class="viz-control-row">
|
|
<div class="viz-control-group" style="margin-bottom: 0;">
|
|
<label>Algorithm</label>
|
|
<select x-model="algorithm">
|
|
<option value="semantic">Semantic (Vector Similarity)</option>
|
|
<option value="keyword">Keyword (Token Matching)</option>
|
|
<option value="fuzzy">Fuzzy (Character Overlap)</option>
|
|
<option value="hybrid" selected>Hybrid (RRF Fusion)</option>
|
|
</select>
|
|
</div>
|
|
|
|
<div style="display: flex; align-items: flex-end;">
|
|
<button type="submit" class="viz-btn" style="width: 100%;">Search & Visualize</button>
|
|
</div>
|
|
|
|
<div style="display: flex; align-items: flex-end;">
|
|
<button type="button" class="viz-btn-secondary" @click="showAdvanced = !showAdvanced" style="white-space: nowrap;">
|
|
<span x-text="showAdvanced ? 'Hide Advanced' : 'Advanced'"></span>
|
|
</button>
|
|
</div>
|
|
</div>
|
|
|
|
<!-- Advanced Options (Collapsible) -->
|
|
<div class="viz-advanced-section" x-show="showAdvanced" x-transition.opacity.duration.200ms>
|
|
<h3 style="margin-top: 0; margin-bottom: 16px; font-size: 16px;">Advanced Options</h3>
|
|
|
|
<div class="viz-advanced-grid">
|
|
<div class="viz-control-group">
|
|
<label>Document Types</label>
|
|
<select x-model="docTypes" multiple>
|
|
<option value="">All Types (cross-app search)</option>
|
|
<option value="note">Notes</option>
|
|
<option value="file">Files</option>
|
|
<option value="calendar">Calendar Events</option>
|
|
<option value="contact">Contacts</option>
|
|
<option value="deck">Deck Cards</option>
|
|
</select>
|
|
<small style="color: #666; display: block; margin-top: 4px;">
|
|
Hold Ctrl/Cmd to select multiple
|
|
</small>
|
|
</div>
|
|
|
|
<div>
|
|
<div class="viz-control-group">
|
|
<label>Score Threshold (Semantic/Hybrid)</label>
|
|
<input type="number" x-model.number="scoreThreshold" min="0" max="1" step="0.1" />
|
|
</div>
|
|
|
|
<div class="viz-control-group">
|
|
<label>Result Limit</label>
|
|
<input type="number" x-model.number="limit" min="1" max="100" />
|
|
</div>
|
|
</div>
|
|
</div>
|
|
|
|
<!-- Hybrid Weights (only when hybrid selected) -->
|
|
<div x-show="algorithm === 'hybrid'" style="margin-top: 16px; padding: 12px; background: #e9ecef; border-radius: 4px;">
|
|
<label style="margin-bottom: 12px; display: block;">Hybrid Algorithm Weights</label>
|
|
|
|
<div style="margin-bottom: 8px;">
|
|
<label style="display: inline-block; width: 100px; font-weight: normal;">Semantic:</label>
|
|
<input type="range" x-model.number="semanticWeight" min="0" max="1" step="0.1" style="width: 200px; display: inline-block;">
|
|
<span class="viz-weight-display" x-text="semanticWeight.toFixed(1)"></span>
|
|
</div>
|
|
<div style="margin-bottom: 8px;">
|
|
<label style="display: inline-block; width: 100px; font-weight: normal;">Keyword:</label>
|
|
<input type="range" x-model.number="keywordWeight" min="0" max="1" step="0.1" style="width: 200px; display: inline-block;">
|
|
<span class="viz-weight-display" x-text="keywordWeight.toFixed(1)"></span>
|
|
</div>
|
|
<div>
|
|
<label style="display: inline-block; width: 100px; font-weight: normal;">Fuzzy:</label>
|
|
<input type="range" x-model.number="fuzzyWeight" min="0" max="1" step="0.1" style="width: 200px; display: inline-block;">
|
|
<span class="viz-weight-display" x-text="fuzzyWeight.toFixed(1)"></span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</form>
|
|
</div>
|
|
|
|
<div class="viz-card">
|
|
<div id="viz-plot-container">
|
|
<div x-show="loading" class="viz-loading-overlay" x-transition.opacity.duration.200ms>
|
|
Executing search and computing PCA projection...
|
|
</div>
|
|
<div id="viz-plot" x-show="!loading" x-transition.opacity.duration.200ms></div>
|
|
</div>
|
|
</div>
|
|
|
|
<div class="viz-card">
|
|
<h3>Search Results (<span x-text="loading ? '...' : results.length"></span>)</h3>
|
|
|
|
<div x-show="loading" class="viz-loading" x-transition.opacity.duration.200ms>
|
|
Loading results...
|
|
</div>
|
|
|
|
<div x-show="!loading && results.length === 0" class="viz-no-results" x-transition.opacity.duration.200ms>
|
|
No results found. Try a different query or adjust your search parameters.
|
|
</div>
|
|
|
|
<template x-if="!loading && results.length > 0">
|
|
<div x-transition.opacity.duration.200ms>
|
|
<template x-for="result in results" :key="result.id">
|
|
<div style="padding: 12px; border-bottom: 1px solid #eee;">
|
|
<a :href="getNextcloudUrl(result)" target="_blank" style="font-weight: 500; color: #0066cc; text-decoration: none;">
|
|
<span x-text="result.title"></span>
|
|
</a>
|
|
<div style="font-size: 14px; color: #666; margin-top: 4px;" x-text="result.excerpt"></div>
|
|
<div style="font-size: 12px; color: #999; margin-top: 4px;">
|
|
Score: <span x-text="result.score.toFixed(3)"></span> |
|
|
Type: <span x-text="result.doc_type"></span>
|
|
</div>
|
|
</div>
|
|
</template>
|
|
</div>
|
|
</template>
|
|
</div>
|
|
</div>
|
|
"""
|
|
|
|
return HTMLResponse(content=html_content)
|
|
|
|
|
|
@requires("authenticated", redirect="oauth_login")
|
|
async def vector_visualization_search(request: Request) -> JSONResponse:
|
|
"""Execute server-side search and return 2D coordinates + results.
|
|
|
|
All processing happens server-side:
|
|
1. Execute search via shared algorithm module
|
|
2. Fetch matching vectors from Qdrant
|
|
3. Apply PCA reduction (768-dim → 2D)
|
|
4. Return coordinates + metadata only
|
|
|
|
Args:
|
|
request: Starlette request with query parameters
|
|
|
|
Returns:
|
|
JSON response with coordinates_2d and results
|
|
"""
|
|
settings = get_settings()
|
|
|
|
if not settings.vector_sync_enabled:
|
|
return JSONResponse(
|
|
{"success": False, "error": "Vector sync not enabled"},
|
|
status_code=400,
|
|
)
|
|
|
|
# Get user info from auth context
|
|
username = (
|
|
request.user.display_name if hasattr(request.user, "display_name") else None
|
|
)
|
|
|
|
if not username:
|
|
return JSONResponse(
|
|
{"success": False, "error": "User not authenticated"},
|
|
status_code=401,
|
|
)
|
|
|
|
# Parse query parameters
|
|
query = request.query_params.get("query", "")
|
|
algorithm = request.query_params.get("algorithm", "hybrid")
|
|
limit = int(request.query_params.get("limit", "50"))
|
|
score_threshold = float(request.query_params.get("score_threshold", "0.7"))
|
|
semantic_weight = float(request.query_params.get("semantic_weight", "0.5"))
|
|
keyword_weight = float(request.query_params.get("keyword_weight", "0.3"))
|
|
fuzzy_weight = float(request.query_params.get("fuzzy_weight", "0.2"))
|
|
|
|
# Parse doc_types (comma-separated list, None = all types)
|
|
doc_types_param = request.query_params.get("doc_types", "")
|
|
doc_types = doc_types_param.split(",") if doc_types_param else None
|
|
|
|
logger.info(
|
|
f"Viz search: user={username}, query='{query}', "
|
|
f"algorithm={algorithm}, limit={limit}, doc_types={doc_types}"
|
|
)
|
|
|
|
try:
|
|
# Get authenticated HTTP client from session
|
|
# In BasicAuth mode: uses username/password from session
|
|
# In OAuth mode: uses access token from session
|
|
from nextcloud_mcp_server.auth.userinfo_routes import (
|
|
_get_authenticated_client_for_userinfo,
|
|
)
|
|
from nextcloud_mcp_server.client.notes import NotesClient
|
|
|
|
async with await _get_authenticated_client_for_userinfo(request) as http_client:
|
|
# Create NotesClient directly with authenticated HTTP client
|
|
notes_client = NotesClient(http_client, username)
|
|
|
|
# Wrap in a minimal client object for search algorithms
|
|
# This conforms to NextcloudClientProtocol but only implements notes
|
|
class MinimalNextcloudClient:
|
|
def __init__(self, notes_client, username):
|
|
self._notes = notes_client
|
|
self.username = username
|
|
|
|
@property
|
|
def notes(self):
|
|
return self._notes
|
|
|
|
@property
|
|
def webdav(self):
|
|
return None
|
|
|
|
@property
|
|
def calendar(self):
|
|
return None
|
|
|
|
@property
|
|
def contacts(self):
|
|
return None
|
|
|
|
@property
|
|
def deck(self):
|
|
return None
|
|
|
|
@property
|
|
def cookbook(self):
|
|
return None
|
|
|
|
@property
|
|
def tables(self):
|
|
return None
|
|
|
|
nextcloud_client = MinimalNextcloudClient(notes_client, username)
|
|
|
|
# Create search algorithm
|
|
if algorithm == "semantic":
|
|
search_algo = SemanticSearchAlgorithm(score_threshold=score_threshold)
|
|
elif algorithm == "keyword":
|
|
search_algo = KeywordSearchAlgorithm()
|
|
elif algorithm == "fuzzy":
|
|
search_algo = FuzzySearchAlgorithm()
|
|
elif algorithm == "hybrid":
|
|
search_algo = HybridSearchAlgorithm(
|
|
semantic_weight=semantic_weight,
|
|
keyword_weight=keyword_weight,
|
|
fuzzy_weight=fuzzy_weight,
|
|
)
|
|
else:
|
|
return JSONResponse(
|
|
{"success": False, "error": f"Unknown algorithm: {algorithm}"},
|
|
status_code=400,
|
|
)
|
|
|
|
# Execute search (supports cross-app when doc_types=None)
|
|
# Get unverified results with buffer for filtering
|
|
all_results = []
|
|
if doc_types is None or len(doc_types) == 0:
|
|
# Cross-app search - search all indexed types
|
|
unverified_results = await search_algo.search(
|
|
query=query,
|
|
user_id=username,
|
|
limit=limit * 2, # Buffer for verification filtering
|
|
doc_type=None, # Search all types
|
|
score_threshold=score_threshold,
|
|
)
|
|
all_results.extend(unverified_results)
|
|
else:
|
|
# Search each document type and combine
|
|
for doc_type in doc_types:
|
|
unverified_results = await search_algo.search(
|
|
query=query,
|
|
user_id=username,
|
|
limit=limit * 2, # Buffer for verification filtering
|
|
doc_type=doc_type,
|
|
score_threshold=score_threshold,
|
|
)
|
|
all_results.extend(unverified_results)
|
|
# Sort by score before verification
|
|
all_results.sort(key=lambda r: r.score, reverse=True)
|
|
|
|
# Verify access for all results (deduplicates and filters)
|
|
from nextcloud_mcp_server.search.verification import verify_search_results
|
|
|
|
verified_results = await verify_search_results(
|
|
all_results, nextcloud_client
|
|
)
|
|
search_results = verified_results[:limit]
|
|
|
|
if not search_results:
|
|
return JSONResponse(
|
|
{
|
|
"success": True,
|
|
"results": [],
|
|
"coordinates_2d": [],
|
|
"message": "No results found",
|
|
}
|
|
)
|
|
|
|
# Fetch vectors for matching results from Qdrant
|
|
qdrant_client = await get_qdrant_client()
|
|
doc_ids = [r.id for r in search_results]
|
|
|
|
# Retrieve vectors for the matching documents
|
|
from qdrant_client.models import FieldCondition, Filter, MatchAny
|
|
|
|
points_response = await qdrant_client.scroll(
|
|
collection_name=settings.get_collection_name(),
|
|
scroll_filter=Filter(
|
|
must=[
|
|
FieldCondition(
|
|
key="doc_id",
|
|
match=MatchAny(any=[str(doc_id) for doc_id in doc_ids]),
|
|
),
|
|
FieldCondition(
|
|
key="user_id",
|
|
match={"value": username},
|
|
),
|
|
]
|
|
),
|
|
limit=len(doc_ids) * 2, # Account for multiple chunks per doc
|
|
with_vectors=True,
|
|
with_payload=["doc_id"], # Need doc_id to map vectors to results
|
|
)
|
|
|
|
points = points_response[0]
|
|
|
|
if not points:
|
|
return JSONResponse(
|
|
{
|
|
"success": True,
|
|
"results": [],
|
|
"coordinates_2d": [],
|
|
"message": "No vectors found for results",
|
|
}
|
|
)
|
|
|
|
# Extract vectors
|
|
vectors = np.array([p.vector for p in points if p.vector is not None])
|
|
|
|
if len(vectors) < 2:
|
|
# Not enough points for PCA
|
|
return JSONResponse(
|
|
{
|
|
"success": True,
|
|
"results": [
|
|
{
|
|
"id": r.id,
|
|
"doc_type": r.doc_type,
|
|
"title": r.title,
|
|
"excerpt": r.excerpt,
|
|
"score": r.score,
|
|
}
|
|
for r in search_results
|
|
],
|
|
"coordinates_2d": [[0, 0]] * len(search_results),
|
|
"message": "Not enough vectors for PCA",
|
|
}
|
|
)
|
|
|
|
# Apply PCA dimensionality reduction (768-dim → 2D)
|
|
pca = PCA(n_components=2)
|
|
coords_2d = pca.fit_transform(vectors)
|
|
|
|
# After fit, these attributes are guaranteed to be set
|
|
assert pca.explained_variance_ratio_ is not None
|
|
|
|
logger.info(
|
|
f"PCA explained variance: PC1={pca.explained_variance_ratio_[0]:.3f}, "
|
|
f"PC2={pca.explained_variance_ratio_[1]:.3f}"
|
|
)
|
|
|
|
# Map results to coordinates (use first chunk per document)
|
|
result_coords = []
|
|
seen_doc_ids = set()
|
|
|
|
for point, coord in zip(points, coords_2d):
|
|
if point.payload:
|
|
doc_id = int(point.payload.get("doc_id", 0))
|
|
if doc_id not in seen_doc_ids and doc_id in doc_ids:
|
|
seen_doc_ids.add(doc_id)
|
|
result_coords.append(coord.tolist())
|
|
|
|
# Build response
|
|
response_results = [
|
|
{
|
|
"id": r.id,
|
|
"doc_type": r.doc_type,
|
|
"title": r.title,
|
|
"excerpt": r.excerpt,
|
|
"score": r.score,
|
|
}
|
|
for r in search_results
|
|
]
|
|
|
|
return JSONResponse(
|
|
{
|
|
"success": True,
|
|
"results": response_results,
|
|
"coordinates_2d": result_coords[: len(search_results)],
|
|
"pca_variance": {
|
|
"pc1": float(pca.explained_variance_ratio_[0]),
|
|
"pc2": float(pca.explained_variance_ratio_[1]),
|
|
},
|
|
}
|
|
)
|
|
|
|
except Exception as e:
|
|
logger.error(f"Viz search error: {e}", exc_info=True)
|
|
return JSONResponse(
|
|
{"success": False, "error": str(e)},
|
|
status_code=500,
|
|
)
|