97b48ca3dd
- Add Plotly.js 3D scatter plot showing search results in PCA space - Create shared visualization.py module to avoid code duplication - Pass include_pca parameter through API chain to enable coordinates - Fix OAuth redirects to use /settings/user/astroglobe The visualization shows document embeddings projected to 3D via PCA, with the query point highlighted in red. Uses Viridis colorscale for score visualization, matching the existing vector-viz page. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
191 lines
7.1 KiB
Python
191 lines
7.1 KiB
Python
"""Shared visualization utilities for PCA coordinate computation.
|
|
|
|
Extracts the PCA coordinate computation logic used by both:
|
|
- viz_routes.py (session-based auth)
|
|
- management.py (OAuth bearer token auth)
|
|
|
|
Both endpoints need to compute 3D PCA coordinates for search results,
|
|
so this module provides the shared implementation.
|
|
"""
|
|
|
|
import logging
|
|
from typing import Any
|
|
|
|
import anyio.to_thread
|
|
import numpy as np
|
|
|
|
from nextcloud_mcp_server.config import get_settings
|
|
from nextcloud_mcp_server.vector.pca import PCA
|
|
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
async def compute_pca_coordinates(
|
|
search_results: list[Any],
|
|
query_embedding: np.ndarray | list[float],
|
|
) -> dict[str, Any]:
|
|
"""Compute PCA 3D coordinates for search results visualization.
|
|
|
|
This is the shared implementation used by both viz_routes.py and
|
|
the management API. It retrieves vectors from Qdrant and applies
|
|
PCA dimensionality reduction.
|
|
|
|
Args:
|
|
search_results: List of SearchResult objects with point_id
|
|
query_embedding: The query embedding vector
|
|
|
|
Returns:
|
|
Dict with:
|
|
- coordinates_3d: List of [x, y, z] for each result
|
|
- query_coords: [x, y, z] for the query point
|
|
- pca_variance: Dict with pc1, pc2, pc3 explained variance ratios
|
|
"""
|
|
settings = get_settings()
|
|
|
|
# Collect point IDs from search results for batch retrieval
|
|
point_ids = [r.point_id for r in search_results if r.point_id]
|
|
|
|
if len(point_ids) < 2:
|
|
return {"coordinates_3d": [], "query_coords": []}
|
|
|
|
qdrant_client = await get_qdrant_client()
|
|
|
|
# Batch retrieve vectors from Qdrant
|
|
points_response = await qdrant_client.retrieve(
|
|
collection_name=settings.get_collection_name(),
|
|
ids=point_ids,
|
|
with_vectors=["dense"],
|
|
with_payload=["doc_id", "chunk_start_offset", "chunk_end_offset"],
|
|
)
|
|
|
|
# Build chunk_vectors_map from batch response
|
|
chunk_vectors_map: dict[tuple[Any, Any, Any], Any] = {}
|
|
for point in points_response:
|
|
if point.vector is not None:
|
|
# Extract dense vector (handle both named and unnamed vectors)
|
|
if isinstance(point.vector, dict):
|
|
vector = point.vector.get("dense")
|
|
else:
|
|
vector = point.vector
|
|
|
|
if vector is not None and point.payload:
|
|
doc_id = point.payload.get("doc_id")
|
|
chunk_start = point.payload.get("chunk_start_offset")
|
|
chunk_end = point.payload.get("chunk_end_offset")
|
|
chunk_key = (doc_id, chunk_start, chunk_end)
|
|
chunk_vectors_map[chunk_key] = vector
|
|
|
|
if len(chunk_vectors_map) < 2:
|
|
return {"coordinates_3d": [], "query_coords": []}
|
|
|
|
# Detect embedding dimension
|
|
embedding_dim = None
|
|
for vector in chunk_vectors_map.values():
|
|
if vector is not None:
|
|
embedding_dim = len(vector)
|
|
break
|
|
|
|
if embedding_dim is None:
|
|
return {"coordinates_3d": [], "query_coords": []}
|
|
|
|
logger.info(f"Detected embedding dimension: {embedding_dim}")
|
|
|
|
# Build chunk vectors array in search_results order (1:1 mapping)
|
|
chunk_vectors = []
|
|
for result in search_results:
|
|
chunk_key = (result.id, result.chunk_start_offset, result.chunk_end_offset)
|
|
if chunk_key in chunk_vectors_map:
|
|
chunk_vectors.append(chunk_vectors_map[chunk_key])
|
|
else:
|
|
# Chunk not found in vectors (shouldn't happen)
|
|
logger.warning(
|
|
f"Chunk {chunk_key} not found in fetched vectors, using zero vector"
|
|
)
|
|
chunk_vectors.append(np.zeros(embedding_dim))
|
|
|
|
chunk_vectors = np.array(chunk_vectors)
|
|
|
|
# Ensure query_embedding is a numpy array
|
|
if not isinstance(query_embedding, np.ndarray):
|
|
query_embedding = np.array(query_embedding)
|
|
|
|
# Combine query vector with chunk vectors for PCA
|
|
# Query will be the last point in the array
|
|
all_vectors = np.vstack([chunk_vectors, np.array([query_embedding])])
|
|
|
|
# Normalize vectors to unit length (L2 normalization)
|
|
# This is critical because Qdrant uses COSINE distance, which only measures
|
|
# vector direction (angle), not magnitude. PCA uses Euclidean distance which
|
|
# considers both direction and magnitude. By normalizing to unit length,
|
|
# Euclidean distances in PCA space will match cosine distances.
|
|
norms = np.linalg.norm(all_vectors, axis=1, keepdims=True)
|
|
|
|
# Check for zero-norm vectors (can happen with empty/corrupted embeddings)
|
|
zero_norm_mask = norms[:, 0] < 1e-10
|
|
if zero_norm_mask.any():
|
|
zero_indices = np.where(zero_norm_mask)[0]
|
|
logger.warning(
|
|
f"Found {zero_norm_mask.sum()} zero-norm vectors at indices "
|
|
f"{zero_indices.tolist()}. Replacing with small epsilon to avoid "
|
|
"division by zero."
|
|
)
|
|
# Replace zero norms with small epsilon to avoid NaN
|
|
norms[zero_norm_mask] = 1e-10
|
|
|
|
all_vectors_normalized = all_vectors / norms
|
|
logger.info(
|
|
f"Normalized vectors: query_norm={norms[-1][0]:.3f}, "
|
|
f"doc_norm_range=[{norms[:-1].min():.3f}, {norms[:-1].max():.3f}]"
|
|
)
|
|
|
|
# Apply PCA dimensionality reduction (768-dim → 3D)
|
|
# Run in thread pool to avoid blocking the event loop (CPU-bound)
|
|
def _compute_pca(vectors: np.ndarray) -> tuple[np.ndarray, PCA]:
|
|
pca = PCA(n_components=3)
|
|
coords = pca.fit_transform(vectors)
|
|
return coords, pca
|
|
|
|
coords_3d, pca = await anyio.to_thread.run_sync(
|
|
lambda: _compute_pca(all_vectors_normalized)
|
|
)
|
|
|
|
# After fit, these attributes are guaranteed to be set
|
|
assert pca.explained_variance_ratio_ is not None
|
|
|
|
# Check for NaN values in PCA output (numerical instability)
|
|
nan_mask = np.isnan(coords_3d)
|
|
if nan_mask.any():
|
|
nan_rows = np.where(nan_mask.any(axis=1))[0]
|
|
logger.error(
|
|
f"Found NaN values in PCA output at {len(nan_rows)} points: "
|
|
f"{nan_rows.tolist()[:10]}. Replacing NaN with 0.0 to prevent "
|
|
"JSON serialization error."
|
|
)
|
|
# Replace NaN with 0 to allow JSON serialization
|
|
coords_3d = np.nan_to_num(coords_3d, nan=0.0)
|
|
|
|
# Split query coords from chunk coords
|
|
# Round to 2 decimal places for cleaner display
|
|
query_coords_3d = [round(float(x), 2) for x in coords_3d[-1]] # Last point is query
|
|
chunk_coords_3d = coords_3d[:-1] # All but last are chunks
|
|
|
|
logger.info(
|
|
f"PCA explained variance: PC1={pca.explained_variance_ratio_[0]:.3f}, "
|
|
f"PC2={pca.explained_variance_ratio_[1]:.3f}, "
|
|
f"PC3={pca.explained_variance_ratio_[2]:.3f}"
|
|
)
|
|
|
|
# Coordinates already match search_results order (1:1 mapping)
|
|
result_coords = [[round(float(x), 2) for x in coord] for coord in chunk_coords_3d]
|
|
|
|
return {
|
|
"coordinates_3d": result_coords,
|
|
"query_coords": query_coords_3d,
|
|
"pca_variance": {
|
|
"pc1": float(pca.explained_variance_ratio_[0]),
|
|
"pc2": float(pca.explained_variance_ratio_[1]),
|
|
"pc3": float(pca.explained_variance_ratio_[2]),
|
|
},
|
|
}
|