Files
nextcloud-mcp-server/nextcloud_mcp_server/auth/viz_routes.py
T
Chris Coutinho 916af1c8f3 feat: Add vector visualization pane with multi-select document types
- Add /app/vector-viz endpoint for interactive search testing
- Implement server-side PCA dimensionality reduction (768-dim → 2D)
- Support multi-select document type filter for cross-app search
- Support all search algorithms: semantic, keyword, fuzzy, hybrid
- Display 2D scatter plot of vector embeddings using Plotly
- Show search results with scores and document types
- Register viz routes in app.py
2025-11-15 02:32:10 +01:00

637 lines
25 KiB
Python

"""Vector visualization routes for testing search algorithms.
Provides a web UI for users to test different search algorithms on their own
indexed documents and visualize results in 2D space using PCA.
All processing happens server-side following ADR-012:
- Search execution via shared search/algorithms.py
- PCA dimensionality reduction (768-dim → 2D)
- Only 2D coordinates + metadata sent to client
- Bandwidth-efficient (2 floats per doc vs 768)
"""
import logging
import numpy as np
from starlette.authentication import requires
from starlette.requests import Request
from starlette.responses import HTMLResponse, JSONResponse
from nextcloud_mcp_server.config import get_settings
from nextcloud_mcp_server.search import (
FuzzySearchAlgorithm,
HybridSearchAlgorithm,
KeywordSearchAlgorithm,
SemanticSearchAlgorithm,
)
from nextcloud_mcp_server.vector.pca import PCA
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
logger = logging.getLogger(__name__)
@requires("authenticated", redirect="oauth_login")
async def vector_visualization_html(request: Request) -> HTMLResponse:
    """Render the vector visualization page (search controls + interactive plot).

    The page is a single self-contained HTML document. It loads Plotly, htmx
    and Alpine.js from public CDNs and calls ``/app/vector-viz/search`` to
    run a search and fetch the 2D coordinates to plot.

    When vector sync is disabled in the settings, a short notice is returned
    instead of the full page.

    Args:
        request: Starlette request object. ``request.session["user_info"]`` is
            read to show the current user's ``preferred_username``.

    Returns:
        HTML page with the search interface, or a notice that vector sync is
        not enabled.
    """
    # Function-scope import: the only extra dependency of this handler.
    from html import escape

    settings = get_settings()
    if not settings.vector_sync_enabled:
        return HTMLResponse(
            """
            <div>
                <h2>Vector Visualization</h2>
                <div style="padding: 20px; background: #fff3cd; border: 1px solid #ffc107; border-radius: 4px;">
                    Vector sync is not enabled. Set VECTOR_SYNC_ENABLED=true to use this feature.
                </div>
            </div>
            """
        )

    # Get user info from session
    user_info = request.session.get("user_info", {})
    # The username comes from the identity provider and is interpolated into
    # markup below, so it must be HTML-escaped to prevent markup injection.
    username = escape(str(user_info.get("preferred_username", "unknown")))

    # NOTE: this is an f-string; literal CSS/JS braces are doubled ({{ }}).
    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Vector Visualization - Nextcloud MCP</title>
        <script src="https://cdn.plot.ly/plotly-2.26.0.min.js"></script>
        <script src="https://unpkg.com/htmx.org@1.9.10"></script>
        <script src="https://unpkg.com/alpinejs@3.13.3/dist/cdn.min.js" defer></script>
        <style>
            body {{
                font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
                margin: 0;
                padding: 20px;
                background: #f5f5f5;
            }}
            .container {{
                max-width: 1400px;
                margin: 0 auto;
            }}
            .card {{
                background: white;
                border-radius: 8px;
                padding: 20px;
                margin-bottom: 20px;
                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            }}
            .controls {{
                display: grid;
                grid-template-columns: 1fr 1fr;
                gap: 20px;
                margin-bottom: 20px;
            }}
            .control-group {{
                margin-bottom: 15px;
            }}
            label {{
                display: block;
                margin-bottom: 5px;
                font-weight: 500;
                color: #333;
            }}
            input[type="text"], select {{
                width: 100%;
                padding: 8px 12px;
                border: 1px solid #ddd;
                border-radius: 4px;
                font-size: 14px;
            }}
            input[type="range"] {{
                width: 100%;
            }}
            .weight-display {{
                display: inline-block;
                min-width: 40px;
                text-align: right;
                color: #666;
            }}
            .btn {{
                background: #0066cc;
                color: white;
                border: none;
                padding: 10px 20px;
                border-radius: 4px;
                cursor: pointer;
                font-size: 14px;
                font-weight: 500;
            }}
            .btn:hover {{
                background: #0052a3;
            }}
            #plot {{
                width: 100%;
                height: 600px;
            }}
            .loading {{
                text-align: center;
                padding: 40px;
                color: #666;
            }}
            .weight-controls {{
                display: none;
            }}
            .weight-controls.active {{
                display: block;
            }}
            .info-box {{
                background: #e3f2fd;
                border-left: 4px solid #2196f3;
                padding: 12px;
                margin-bottom: 20px;
                font-size: 14px;
            }}
        </style>
    </head>
    <body>
        <div class="container" x-data="vizApp()">
            <div class="card">
                <h1>Vector Visualization</h1>
                <div class="info-box">
                    Testing search algorithms on your indexed documents. User: <strong>{username}</strong>
                </div>
                <form @submit.prevent="executeSearch">
                    <div class="controls">
                        <div>
                            <div class="control-group">
                                <label>Search Query</label>
                                <input type="text" x-model="query" placeholder="Enter search query..." />
                            </div>
                            <div class="control-group">
                                <label>Search Algorithm</label>
                                <select x-model="algorithm" @change="updateWeightControls">
                                    <option value="semantic">Semantic (Vector Similarity)</option>
                                    <option value="keyword">Keyword (Token Matching)</option>
                                    <option value="fuzzy">Fuzzy (Character Overlap)</option>
                                    <option value="hybrid" selected>Hybrid (RRF Fusion)</option>
                                </select>
                            </div>
                            <div class="control-group">
                                <label>Document Types (multi-select)</label>
                                <select x-model="docTypes" multiple size="4" style="height: auto;">
                                    <option value="">All Types (cross-app search)</option>
                                    <option value="note">Notes</option>
                                    <option value="file">Files</option>
                                    <option value="calendar">Calendar Events</option>
                                    <option value="contact">Contacts</option>
                                    <option value="deck">Deck Cards</option>
                                </select>
                                <small style="color: #666; display: block; margin-top: 4px;">
                                    Hold Ctrl/Cmd to select multiple. Select "All Types" for cross-app search.
                                </small>
                            </div>
                            <div class="control-group weight-controls" :class="{{ active: algorithm === 'hybrid' }}">
                                <label>Hybrid Weights</label>
                                <div style="margin-bottom: 8px;">
                                    <label style="display: inline-block; width: 100px;">Semantic:</label>
                                    <input type="range" x-model.number="semanticWeight" min="0" max="1" step="0.1" style="width: 200px; display: inline-block;">
                                    <span class="weight-display" x-text="semanticWeight.toFixed(1)"></span>
                                </div>
                                <div style="margin-bottom: 8px;">
                                    <label style="display: inline-block; width: 100px;">Keyword:</label>
                                    <input type="range" x-model.number="keywordWeight" min="0" max="1" step="0.1" style="width: 200px; display: inline-block;">
                                    <span class="weight-display" x-text="keywordWeight.toFixed(1)"></span>
                                </div>
                                <div>
                                    <label style="display: inline-block; width: 100px;">Fuzzy:</label>
                                    <input type="range" x-model.number="fuzzyWeight" min="0" max="1" step="0.1" style="width: 200px; display: inline-block;">
                                    <span class="weight-display" x-text="fuzzyWeight.toFixed(1)"></span>
                                </div>
                            </div>
                        </div>
                        <div>
                            <div class="control-group">
                                <label>Result Limit</label>
                                <input type="number" x-model.number="limit" min="1" max="100" value="50" />
                            </div>
                            <div class="control-group">
                                <label>Score Threshold (Semantic/Hybrid)</label>
                                <input type="number" x-model.number="scoreThreshold" min="0" max="1" step="0.1" value="0.7" />
                            </div>
                            <div class="control-group">
                                <button type="submit" class="btn">Search & Visualize</button>
                            </div>
                        </div>
                    </div>
                </form>
            </div>
            <div class="card">
                <div x-show="loading" class="loading">
                    Executing search and computing PCA projection...
                </div>
                <div id="plot" x-show="!loading"></div>
            </div>
            <div class="card" x-show="results.length > 0">
                <h2>Search Results (<span x-text="results.length"></span>)</h2>
                <template x-for="result in results" :key="result.id">
                    <div style="padding: 12px; border-bottom: 1px solid #eee;">
                        <div style="font-weight: 500; color: #0066cc;" x-text="result.title"></div>
                        <div style="font-size: 14px; color: #666; margin-top: 4px;" x-text="result.excerpt"></div>
                        <div style="font-size: 12px; color: #999; margin-top: 4px;">
                            Score: <span x-text="result.score.toFixed(3)"></span> |
                            Type: <span x-text="result.doc_type"></span>
                        </div>
                    </div>
                </template>
            </div>
        </div>
        <script>
            function vizApp() {{
                return {{
                    query: '',
                    algorithm: 'hybrid',
                    docTypes: [''],  // Default to "All Types"
                    limit: 50,
                    scoreThreshold: 0.7,
                    semanticWeight: 0.5,
                    keywordWeight: 0.3,
                    fuzzyWeight: 0.2,
                    loading: false,
                    results: [],
                    updateWeightControls() {{
                        // Update weight controls visibility based on algorithm
                    }},
                    async executeSearch() {{
                        this.loading = true;
                        this.results = [];
                        try {{
                            const params = new URLSearchParams({{
                                query: this.query,
                                algorithm: this.algorithm,
                                limit: this.limit,
                                score_threshold: this.scoreThreshold,
                                semantic_weight: this.semanticWeight,
                                keyword_weight: this.keywordWeight,
                                fuzzy_weight: this.fuzzyWeight,
                            }});
                            // Add doc_types parameter (filter out empty string for "All Types")
                            const selectedTypes = this.docTypes.filter(t => t !== '');
                            if (selectedTypes.length > 0) {{
                                params.append('doc_types', selectedTypes.join(','));
                            }}
                            const response = await fetch(`/app/vector-viz/search?${{params}}`);
                            const data = await response.json();
                            if (data.success) {{
                                this.results = data.results;
                                this.renderPlot(data.coordinates_2d, data.results);
                            }} else {{
                                alert('Search failed: ' + data.error);
                            }}
                        }} catch (error) {{
                            alert('Error: ' + error.message);
                        }} finally {{
                            this.loading = false;
                        }}
                    }},
                    renderPlot(coordinates, results) {{
                        const trace = {{
                            x: coordinates.map(c => c[0]),
                            y: coordinates.map(c => c[1]),
                            mode: 'markers',
                            type: 'scatter',
                            text: results.map(r => `${{r.title}}<br>Score: ${{r.score.toFixed(3)}}`),
                            marker: {{
                                size: 8,
                                color: results.map(r => r.score),
                                colorscale: 'Viridis',
                                showscale: true,
                                colorbar: {{ title: 'Score' }}
                            }}
                        }};
                        const layout = {{
                            title: `Vector Space (PCA 2D) - ${{results.length}} results`,
                            xaxis: {{ title: 'PC1' }},
                            yaxis: {{ title: 'PC2' }},
                            hovermode: 'closest',
                            height: 600
                        }};
                        Plotly.newPlot('plot', [trace], layout);
                    }}
                }}
            }}
        </script>
    </body>
    </html>
    """
    return HTMLResponse(content=html_content)
# Upper bound for the ``limit`` query parameter; mirrors ``max="100"`` on the
# "Result Limit" input of the visualization page.
_MAX_VIZ_RESULT_LIMIT = 100


class _NotesOnlyNextcloudClient:
    """Minimal client object handed to the search algorithms.

    Only ``notes`` is backed by a real client; every other app attribute is
    ``None``. Intended to satisfy the client interface the search algorithms
    expect (NextcloudClientProtocol, per the original inline comment).
    """

    webdav = None
    calendar = None
    contacts = None
    deck = None
    cookbook = None
    tables = None

    def __init__(self, notes_client, username):
        self.notes = notes_client
        self.username = username


def _parse_viz_search_params(query_params):
    """Parse the search query parameters into a plain dict.

    Args:
        query_params: Mapping-like object with a ``get(key, default)`` method
            (``request.query_params``).

    Returns:
        Dict with keys ``query``, ``algorithm``, ``limit``, ``score_threshold``,
        ``semantic_weight``, ``keyword_weight``, ``fuzzy_weight`` and
        ``doc_types`` (list of non-empty names, or ``None`` for all types).

    Raises:
        ValueError: If a numeric parameter cannot be converted.
    """
    requested_limit = int(query_params.get("limit", "50"))
    # Drop blank entries so "note,,file" or a trailing comma cannot turn into
    # a search for the empty document type.
    doc_types = [
        name.strip()
        for name in query_params.get("doc_types", "").split(",")
        if name.strip()
    ]
    return {
        "query": query_params.get("query", ""),
        "algorithm": query_params.get("algorithm", "hybrid"),
        # Clamp instead of trusting the client: zero/negative values break the
        # slicing below and huge values mean unbounded search and Qdrant work.
        "limit": max(1, min(requested_limit, _MAX_VIZ_RESULT_LIMIT)),
        "score_threshold": float(query_params.get("score_threshold", "0.7")),
        "semantic_weight": float(query_params.get("semantic_weight", "0.5")),
        "keyword_weight": float(query_params.get("keyword_weight", "0.3")),
        "fuzzy_weight": float(query_params.get("fuzzy_weight", "0.2")),
        "doc_types": doc_types or None,
    }


def _build_search_algorithm(params):
    """Instantiate the search algorithm named in *params*.

    Returns:
        The algorithm instance, or ``None`` when the name is not recognised.
    """
    name = params["algorithm"]
    if name == "semantic":
        return SemanticSearchAlgorithm(score_threshold=params["score_threshold"])
    if name == "keyword":
        return KeywordSearchAlgorithm()
    if name == "fuzzy":
        return FuzzySearchAlgorithm()
    if name == "hybrid":
        return HybridSearchAlgorithm(
            semantic_weight=params["semantic_weight"],
            keyword_weight=params["keyword_weight"],
            fuzzy_weight=params["fuzzy_weight"],
        )
    return None


def _serialize_search_result(result):
    """Convert one search result object into its JSON representation."""
    return {
        "id": result.id,
        "doc_type": result.doc_type,
        "title": result.title,
        "excerpt": result.excerpt,
        "score": result.score,
    }


async def _build_viz_payload(search_results, username, settings):
    """Fetch vectors for *search_results*, project them to 2D, build the payload.

    ``results`` and ``coordinates_2d`` in the returned dict are parallel
    lists: entry ``i`` of one belongs to entry ``i`` of the other, which is
    what the page's plotting code relies on.

    Args:
        search_results: Ranked result objects exposing ``id``, ``doc_type``,
            ``title``, ``excerpt`` and ``score``.
        username: User whose points are fetched from Qdrant.
        settings: Application settings; provides the collection name.

    Returns:
        JSON-serializable response body.

    Raises:
        RuntimeError: If the PCA object reports no explained variance after
            fitting.
    """
    if not search_results:
        return {
            "success": True,
            "results": [],
            "coordinates_2d": [],
            "message": "No results found",
        }

    from qdrant_client.models import FieldCondition, Filter, MatchAny

    qdrant = await get_qdrant_client()
    # The filter matches on the string form of the ID, so use the same form
    # as the lookup key when mapping points back to results.
    doc_ids = [str(result.id) for result in search_results]
    points, _next_offset = await qdrant.scroll(
        collection_name=settings.get_collection_name(),
        scroll_filter=Filter(
            must=[
                FieldCondition(key="doc_id", match=MatchAny(any=doc_ids)),
                FieldCondition(key="user_id", match={"value": username}),
            ]
        ),
        limit=len(doc_ids) * 2,  # Account for multiple chunks per doc
        with_vectors=True,
        # ``doc_id`` is required to map each point back to its search result;
        # without the payload no coordinate could ever be assigned.
        with_payload=["doc_id"],
    )

    if not points:
        return {
            "success": True,
            "results": [],
            "coordinates_2d": [],
            "message": "No vectors found for results",
        }

    # Filter first so points and PCA rows stay index-aligned.
    vector_points = [point for point in points if point.vector is not None]
    if len(vector_points) < 2:
        # Not enough points for PCA
        return {
            "success": True,
            "results": [_serialize_search_result(r) for r in search_results],
            "coordinates_2d": [[0, 0] for _ in search_results],
            "message": "Not enough vectors for PCA",
        }

    # Apply PCA dimensionality reduction (768-dim → 2D)
    pca = PCA(n_components=2)
    coords_2d = pca.fit_transform(np.array([p.vector for p in vector_points]))
    variance = pca.explained_variance_ratio_
    if variance is None:
        raise RuntimeError("PCA did not report explained variance after fit")
    logger.info(
        "PCA explained variance: PC1=%.3f, PC2=%.3f", variance[0], variance[1]
    )

    # Use the first chunk seen per document as that document's position.
    coord_by_doc_id = {}
    for point, coord in zip(vector_points, coords_2d):
        doc_id = (point.payload or {}).get("doc_id")
        if doc_id is not None:
            coord_by_doc_id.setdefault(str(doc_id), coord.tolist())

    # Emit results and coordinates in search-rank order, in lockstep. Results
    # without a retrieved vector cannot be plotted and are left out so the
    # two lists never drift apart.
    response_results = []
    result_coords = []
    for result in search_results:
        coord = coord_by_doc_id.get(str(result.id))
        if coord is None:
            continue
        response_results.append(_serialize_search_result(result))
        result_coords.append(coord)

    payload = {
        "success": True,
        "results": response_results,
        "coordinates_2d": result_coords,
        "pca_variance": {
            "pc1": float(variance[0]),
            "pc2": float(variance[1]),
        },
    }
    omitted = len(search_results) - len(response_results)
    if omitted:
        logger.info("Viz search: %d result(s) without vectors omitted", omitted)
        payload["message"] = f"{omitted} result(s) without vectors omitted"
    return payload


@requires("authenticated", redirect="oauth_login")
async def vector_visualization_search(request: Request) -> JSONResponse:
    """Execute server-side search and return 2D coordinates + results.

    All processing happens server-side:
    1. Execute search via shared algorithm module
    2. Fetch matching vectors from Qdrant
    3. Apply PCA reduction (768-dim → 2D)
    4. Return coordinates + metadata only

    Query parameters: ``query``, ``algorithm`` (semantic | keyword | fuzzy |
    hybrid), ``limit`` (clamped to 1..100), ``score_threshold``,
    ``semantic_weight``, ``keyword_weight``, ``fuzzy_weight`` and
    ``doc_types`` (comma-separated; omitted means all types).

    Args:
        request: Starlette request with query parameters

    Returns:
        JSON response with ``success``, ``results`` and ``coordinates_2d``
        (parallel lists), plus ``pca_variance`` when a projection was
        computed. Errors are reported as ``{"success": False, "error": ...}``
        with status 400 (bad input / feature disabled), 401 (no user in
        session) or 500 (unexpected failure).
    """
    settings = get_settings()
    if not settings.vector_sync_enabled:
        return JSONResponse(
            {"success": False, "error": "Vector sync not enabled"},
            status_code=400,
        )

    # Get user info
    user_info = request.session.get("user_info", {})
    username = user_info.get("preferred_username")
    if not username:
        return JSONResponse(
            {"success": False, "error": "User not authenticated"},
            status_code=401,
        )

    # Malformed numbers are a client error, not a server failure.
    try:
        params = _parse_viz_search_params(request.query_params)
    except ValueError as exc:
        return JSONResponse(
            {"success": False, "error": f"Invalid query parameter: {exc}"},
            status_code=400,
        )

    query = params["query"]
    limit = params["limit"]
    doc_types = params["doc_types"]
    logger.info(
        "Viz search: user=%s, query='%s', algorithm=%s, limit=%s, doc_types=%s",
        username,
        query,
        params["algorithm"],
        limit,
        doc_types,
    )

    search_algo = _build_search_algorithm(params)
    if search_algo is None:
        return JSONResponse(
            {"success": False, "error": f"Unknown algorithm: {params['algorithm']}"},
            status_code=400,
        )

    try:
        # Get authenticated HTTP client from session
        # In BasicAuth mode: uses username/password from session
        # In OAuth mode: uses access token from session
        from nextcloud_mcp_server.auth.userinfo_routes import (
            _get_authenticated_client_for_userinfo,
        )
        from nextcloud_mcp_server.client.notes import NotesClient

        async with await _get_authenticated_client_for_userinfo(request) as http_client:
            nextcloud_client = _NotesOnlyNextcloudClient(
                NotesClient(http_client, username), username
            )

            async def run_search(doc_type, result_limit):
                return await search_algo.search(
                    query=query,
                    user_id=username,
                    limit=result_limit,
                    doc_type=doc_type,
                    nextcloud_client=nextcloud_client,
                    score_threshold=params["score_threshold"],
                )

            if doc_types is None or len(doc_types) == 1:
                # Cross-app search (doc_type=None) or a single document type
                search_results = await run_search(
                    doc_types[0] if doc_types else None, limit
                )
            else:
                # Multiple document types - search each and combine
                combined = []
                for doc_type in doc_types:
                    # Get extra per type so the merged top-N is not starved
                    combined.extend(await run_search(doc_type, limit * 2))
                combined.sort(key=lambda r: r.score, reverse=True)
                search_results = combined[:limit]

            payload = await _build_viz_payload(search_results, username, settings)
            return JSONResponse(payload)
    except Exception as e:
        # Top-level boundary for the endpoint: log with traceback and report
        # the failure in the JSON shape the page expects.
        logger.error("Viz search error: %s", e, exc_info=True)
        return JSONResponse(
            {"success": False, "error": str(e)},
            status_code=500,
        )