feat: Implement per-chunk vector visualization with context expansion
Major improvements to vector visualization page: - Refactor PCA to display individual chunks instead of averaged documents - Add context expansion module for fetching surrounding text from notes and PDFs - Update deduplication to use (doc_id, doc_type, chunk_start, chunk_end) keys - Fix Alpine.js rendering with chunk-specific keys including offsets - Refactor authentication helper to return NextcloudClient for better reuse - Add async context manager support to NextcloudClient Technical details: - viz_routes.py: Fetch specific chunk vectors instead of averaging per document - context.py: New module supporting both notes and PDF text extraction via PyMuPDF - search algorithms: Extract page_number, chunk_index, total_chunks from Qdrant - vector-viz.js/html: Use chunk positions in expansion tracking keys This enables users to see which specific chunks match their query and view them with surrounding context in the PCA visualization. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -217,7 +217,7 @@ function vizApp() {
|
||||
},
|
||||
|
||||
async toggleChunk(result) {
|
||||
const resultKey = `${result.doc_type}_${result.id}`;
|
||||
const resultKey = `${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`;
|
||||
|
||||
if (this.isChunkExpanded(resultKey)) {
|
||||
delete this.expandedChunks[resultKey];
|
||||
|
||||
@@ -117,7 +117,7 @@
|
||||
|
||||
<template x-if="!loading && results.length > 0">
|
||||
<div x-transition.opacity.duration.200ms>
|
||||
<template x-for="result in results" :key="result.id">
|
||||
<template x-for="result in results" :key="`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`">
|
||||
<div style="padding: 12px; border-bottom: 1px solid #eee;">
|
||||
<a :href="getNextcloudUrl(result)" target="_blank" style="font-weight: 500; color: #0066cc; text-decoration: none;">
|
||||
<span x-text="result.title"></span>
|
||||
@@ -134,22 +134,22 @@
|
||||
<button
|
||||
class="chunk-toggle-btn"
|
||||
@click="toggleChunk(result)"
|
||||
x-text="isChunkExpanded(`${result.doc_type}_${result.id}`) ? 'Hide Chunk' : 'Show Chunk'"
|
||||
x-text="isChunkExpanded(`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`) ? 'Hide Chunk' : 'Show Chunk'"
|
||||
></button>
|
||||
</template>
|
||||
|
||||
<!-- Chunk context (expanded inline) -->
|
||||
<template x-if="isChunkExpanded(`${result.doc_type}_${result.id}`)">
|
||||
<template x-if="isChunkExpanded(`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`)">
|
||||
<div class="chunk-context" x-transition.opacity.duration.200ms>
|
||||
<template x-if="chunkLoading[`${result.doc_type}_${result.id}`]">
|
||||
<template x-if="chunkLoading[`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`]">
|
||||
<div style="color: #666; font-style: italic;">Loading chunk...</div>
|
||||
</template>
|
||||
<template x-if="!chunkLoading[`${result.doc_type}_${result.id}`]">
|
||||
<template x-if="!chunkLoading[`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`]">
|
||||
<div>
|
||||
<template x-if="expandedChunks[`${result.doc_type}_${result.id}`]?.has_more_before">
|
||||
<template x-if="expandedChunks[`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`]?.has_more_before">
|
||||
<span class="chunk-ellipsis">...</span>
|
||||
</template>
|
||||
<span class="chunk-text" x-text="expandedChunks[`${result.doc_type}_${result.id}`]?.before_context"></span><span class="chunk-matched" x-text="expandedChunks[`${result.doc_type}_${result.id}`]?.chunk_text"></span><span class="chunk-text" x-text="expandedChunks[`${result.doc_type}_${result.id}`]?.after_context"></span><template x-if="expandedChunks[`${result.doc_type}_${result.id}`]?.has_more_after">
|
||||
<span class="chunk-text" x-text="expandedChunks[`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`]?.before_context"></span><span class="chunk-matched" x-text="expandedChunks[`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`]?.chunk_text"></span><span class="chunk-text" x-text="expandedChunks[`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`]?.after_context"></span><template x-if="expandedChunks[`${result.doc_type}_${result.id}_${result.chunk_start_offset || 0}`]?.has_more_after">
|
||||
<span class="chunk-ellipsis">...</span>
|
||||
</template>
|
||||
</div>
|
||||
|
||||
@@ -18,6 +18,8 @@ from starlette.authentication import requires
|
||||
from starlette.requests import Request
|
||||
from starlette.responses import HTMLResponse, JSONResponse
|
||||
|
||||
from nextcloud_mcp_server.client import NextcloudClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Setup Jinja2 environment for templates
|
||||
@@ -25,14 +27,20 @@ _template_dir = Path(__file__).parent / "templates"
|
||||
_jinja_env = Environment(loader=FileSystemLoader(_template_dir))
|
||||
|
||||
|
||||
async def _get_authenticated_client_for_userinfo(request: Request) -> httpx.AsyncClient:
|
||||
"""Get an authenticated HTTP client for user info page operations.
|
||||
async def _get_authenticated_client_for_userinfo(request: Request) -> NextcloudClient:
|
||||
"""Get an authenticated Nextcloud client for user info page operations.
|
||||
|
||||
This is a shared helper for authenticated routes that need to access
|
||||
Nextcloud APIs. It handles both BasicAuth and OAuth authentication modes.
|
||||
|
||||
Args:
|
||||
request: Starlette request object
|
||||
|
||||
Returns:
|
||||
Authenticated httpx.AsyncClient
|
||||
Authenticated NextcloudClient
|
||||
|
||||
Raises:
|
||||
RuntimeError: If credentials/session not configured
|
||||
"""
|
||||
oauth_ctx = getattr(request.app.state, "oauth_context", None)
|
||||
|
||||
@@ -45,11 +53,15 @@ async def _get_authenticated_client_for_userinfo(request: Request) -> httpx.Asyn
|
||||
if not all([nextcloud_host, username, password]):
|
||||
raise RuntimeError("BasicAuth credentials not configured")
|
||||
|
||||
assert nextcloud_host is not None # Type narrowing for type checker
|
||||
return httpx.AsyncClient(
|
||||
from httpx import BasicAuth
|
||||
|
||||
assert nextcloud_host is not None
|
||||
assert username is not None
|
||||
assert password is not None
|
||||
return NextcloudClient(
|
||||
base_url=nextcloud_host,
|
||||
auth=(username, password),
|
||||
timeout=30.0,
|
||||
username=username,
|
||||
auth=BasicAuth(username, password),
|
||||
)
|
||||
|
||||
# OAuth mode - get token from session
|
||||
@@ -64,15 +76,14 @@ async def _get_authenticated_client_for_userinfo(request: Request) -> httpx.Asyn
|
||||
raise RuntimeError("No access token found in session")
|
||||
|
||||
access_token = token_data["access_token"]
|
||||
username = token_data.get("username")
|
||||
nextcloud_host = oauth_ctx.get("config", {}).get("nextcloud_host", "")
|
||||
|
||||
if not nextcloud_host:
|
||||
raise RuntimeError("Nextcloud host not configured")
|
||||
if not nextcloud_host or not username:
|
||||
raise RuntimeError("Nextcloud host or username not configured")
|
||||
|
||||
return httpx.AsyncClient(
|
||||
base_url=nextcloud_host,
|
||||
headers={"Authorization": f"Bearer {access_token}"},
|
||||
timeout=30.0,
|
||||
return NextcloudClient.from_token(
|
||||
base_url=nextcloud_host, token=access_token, username=username
|
||||
)
|
||||
|
||||
|
||||
@@ -423,10 +434,10 @@ async def user_info_html(request: Request) -> HTMLResponse:
|
||||
try:
|
||||
from nextcloud_mcp_server.auth.permissions import is_nextcloud_admin
|
||||
|
||||
# Get authenticated HTTP client
|
||||
http_client = await _get_authenticated_client_for_userinfo(request)
|
||||
is_admin = await is_nextcloud_admin(request, http_client)
|
||||
await http_client.aclose()
|
||||
# Get authenticated Nextcloud client
|
||||
nc_client = await _get_authenticated_client_for_userinfo(request)
|
||||
is_admin = await is_nextcloud_admin(request, nc_client._client)
|
||||
await nc_client.close()
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to check admin status: {e}")
|
||||
# Default to not admin if check fails
|
||||
|
||||
@@ -138,7 +138,7 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
_get_authenticated_client_for_userinfo,
|
||||
)
|
||||
|
||||
async with await _get_authenticated_client_for_userinfo(request) as http_client: # noqa: F841
|
||||
async with await _get_authenticated_client_for_userinfo(request) as nc_client: # noqa: F841
|
||||
# Create search algorithm (no client needed - verification removed)
|
||||
if algorithm == "semantic":
|
||||
search_algo = SemanticSearchAlgorithm(score_threshold=score_threshold)
|
||||
@@ -217,72 +217,75 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
}
|
||||
)
|
||||
|
||||
# Fetch vectors for matching results from Qdrant
|
||||
# Fetch vectors for specific matching chunks from Qdrant
|
||||
vector_fetch_start = time.perf_counter()
|
||||
qdrant_client = await get_qdrant_client()
|
||||
doc_ids = [r.id for r in search_results]
|
||||
|
||||
# Retrieve vectors for the matching documents
|
||||
from qdrant_client.models import FieldCondition, Filter, MatchAny
|
||||
# Build filters for each specific chunk
|
||||
from qdrant_client.models import FieldCondition, Filter, MatchValue
|
||||
|
||||
points_response = await qdrant_client.scroll(
|
||||
collection_name=settings.get_collection_name(),
|
||||
scroll_filter=Filter(
|
||||
must=[
|
||||
chunk_vectors_map = {} # Map (doc_id, chunk_start, chunk_end) -> vector
|
||||
|
||||
# Fetch vectors in batches by filtering on chunk-specific fields
|
||||
for result in search_results:
|
||||
chunk_start = result.chunk_start_offset
|
||||
chunk_end = result.chunk_end_offset
|
||||
|
||||
# Build filter for this specific chunk
|
||||
must_conditions = [
|
||||
FieldCondition(
|
||||
key="doc_id",
|
||||
match=MatchValue(value=str(result.id)),
|
||||
),
|
||||
FieldCondition(
|
||||
key="user_id",
|
||||
match=MatchValue(value=username),
|
||||
),
|
||||
]
|
||||
|
||||
# Add chunk position filters if available
|
||||
if chunk_start is not None:
|
||||
must_conditions.append(
|
||||
FieldCondition(
|
||||
key="doc_id",
|
||||
match=MatchAny(any=[str(doc_id) for doc_id in doc_ids]),
|
||||
),
|
||||
key="chunk_start_offset",
|
||||
match=MatchValue(value=chunk_start),
|
||||
)
|
||||
)
|
||||
if chunk_end is not None:
|
||||
must_conditions.append(
|
||||
FieldCondition(
|
||||
key="user_id",
|
||||
match={"value": username},
|
||||
),
|
||||
]
|
||||
),
|
||||
limit=len(doc_ids) * 2, # Account for multiple chunks per doc
|
||||
with_vectors=["dense"], # Only fetch dense vectors for visualization
|
||||
with_payload=["doc_id"], # Need doc_id to map vectors to results
|
||||
)
|
||||
key="chunk_end_offset",
|
||||
match=MatchValue(value=chunk_end),
|
||||
)
|
||||
)
|
||||
|
||||
points = points_response[0]
|
||||
|
||||
if not points:
|
||||
return JSONResponse(
|
||||
{
|
||||
"success": True,
|
||||
"results": [],
|
||||
"coordinates_2d": [],
|
||||
"message": "No vectors found for results",
|
||||
}
|
||||
# Fetch this specific chunk vector
|
||||
points_response = await qdrant_client.scroll(
|
||||
collection_name=settings.get_collection_name(),
|
||||
scroll_filter=Filter(must=must_conditions),
|
||||
limit=1, # Only need the first match
|
||||
with_vectors=["dense"],
|
||||
with_payload=False,
|
||||
)
|
||||
|
||||
# Extract dense vectors and group by document
|
||||
def extract_dense_vector(point):
|
||||
if point.vector is None:
|
||||
return None
|
||||
# If named vectors (dict), extract "dense"
|
||||
if isinstance(point.vector, dict):
|
||||
return point.vector.get("dense")
|
||||
# If unnamed vector (array), use directly
|
||||
return point.vector
|
||||
points = points_response[0]
|
||||
if points:
|
||||
# Extract dense vector
|
||||
point = points[0]
|
||||
if point.vector is not None:
|
||||
# If named vectors (dict), extract "dense"
|
||||
if isinstance(point.vector, dict):
|
||||
vector = point.vector.get("dense")
|
||||
else:
|
||||
vector = point.vector
|
||||
|
||||
# Group chunk vectors by doc_id
|
||||
from collections import defaultdict
|
||||
|
||||
doc_chunks = defaultdict(list)
|
||||
for point in points:
|
||||
if point.payload:
|
||||
# doc_id can be int (for notes) or str (for files - file path)
|
||||
# Keep original type instead of forcing to int
|
||||
doc_id = point.payload.get("doc_id", 0)
|
||||
vector = extract_dense_vector(point)
|
||||
if vector is not None:
|
||||
doc_chunks[doc_id].append(vector)
|
||||
chunk_key = (result.id, chunk_start, chunk_end)
|
||||
chunk_vectors_map[chunk_key] = vector
|
||||
|
||||
vector_fetch_duration = time.perf_counter() - vector_fetch_start
|
||||
|
||||
if len(doc_chunks) < 2:
|
||||
# Not enough documents for PCA
|
||||
if len(chunk_vectors_map) < 2:
|
||||
# Not enough chunks for PCA
|
||||
return JSONResponse(
|
||||
{
|
||||
"success": True,
|
||||
@@ -298,15 +301,15 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
],
|
||||
"coordinates_3d": [[0, 0, 0]] * len(search_results),
|
||||
"query_coords": [0, 0, 0],
|
||||
"message": "Not enough documents for PCA",
|
||||
"message": "Not enough chunks for PCA",
|
||||
}
|
||||
)
|
||||
|
||||
# Detect embedding dimension from first available vector
|
||||
embedding_dim = None
|
||||
for chunks in doc_chunks.values():
|
||||
if chunks:
|
||||
embedding_dim = len(chunks[0])
|
||||
for vector in chunk_vectors_map.values():
|
||||
if vector is not None:
|
||||
embedding_dim = len(vector)
|
||||
break
|
||||
|
||||
if embedding_dim is None:
|
||||
@@ -320,23 +323,21 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
|
||||
logger.info(f"Detected embedding dimension: {embedding_dim}")
|
||||
|
||||
# Average chunk vectors per document to create document-level embeddings
|
||||
# Maintain order of search_results for coordinate mapping
|
||||
doc_vectors = []
|
||||
# Build chunk vectors array in search_results order (1:1 mapping)
|
||||
chunk_vectors = []
|
||||
for result in search_results:
|
||||
if result.id in doc_chunks:
|
||||
# Average all chunk embeddings for this document
|
||||
chunk_vectors = np.array(doc_chunks[result.id])
|
||||
avg_vector = np.mean(chunk_vectors, axis=0)
|
||||
doc_vectors.append(avg_vector)
|
||||
logger.debug(f"Doc {result.id}: averaged {len(chunk_vectors)} chunks")
|
||||
chunk_key = (result.id, result.chunk_start_offset, result.chunk_end_offset)
|
||||
if chunk_key in chunk_vectors_map:
|
||||
chunk_vectors.append(chunk_vectors_map[chunk_key])
|
||||
else:
|
||||
# Document not found in vectors (shouldn't happen)
|
||||
logger.warning(f"Doc {result.id} not found in fetched vectors")
|
||||
# Use zero vector as fallback with detected dimension
|
||||
doc_vectors.append(np.zeros(embedding_dim))
|
||||
# Chunk not found in vectors (shouldn't happen)
|
||||
logger.warning(
|
||||
f"Chunk {chunk_key} not found in fetched vectors, using zero vector"
|
||||
)
|
||||
# Use zero vector as fallback
|
||||
chunk_vectors.append(np.zeros(embedding_dim))
|
||||
|
||||
doc_vectors = np.array(doc_vectors)
|
||||
chunk_vectors = np.array(chunk_vectors)
|
||||
|
||||
# Generate query embedding for visualization
|
||||
query_embed_start = time.perf_counter()
|
||||
@@ -348,9 +349,9 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
|
||||
logger.info(f"Generated query embedding (dimension={len(query_embedding)})")
|
||||
|
||||
# Combine query vector with document vectors for PCA
|
||||
# Combine query vector with chunk vectors for PCA
|
||||
# Query will be the last point in the array
|
||||
all_vectors = np.vstack([doc_vectors, np.array([query_embedding])])
|
||||
all_vectors = np.vstack([chunk_vectors, np.array([query_embedding])])
|
||||
|
||||
# Normalize vectors to unit length (L2 normalization)
|
||||
# This is critical because Qdrant uses COSINE distance, which only measures
|
||||
@@ -396,17 +397,12 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
# Replace NaN with 0 to allow JSON serialization
|
||||
coords_3d = np.nan_to_num(coords_3d, nan=0.0)
|
||||
|
||||
# Split query coords from document coords
|
||||
# Split query coords from chunk coords
|
||||
# Round to 2 decimal places for cleaner display
|
||||
query_coords_3d = [
|
||||
round(float(x), 2) for x in coords_3d[-1]
|
||||
] # Last point is query
|
||||
doc_coords_3d = coords_3d[:-1] # All but last are documents
|
||||
|
||||
total_chunks = sum(len(chunks) for chunks in doc_chunks.values())
|
||||
avg_chunks_per_doc = (
|
||||
total_chunks / len(doc_vectors) if doc_vectors.size > 0 else 0
|
||||
)
|
||||
chunk_coords_3d = coords_3d[:-1] # All but last are chunks
|
||||
|
||||
logger.info(
|
||||
f"PCA explained variance: PC1={pca.explained_variance_ratio_[0]:.3f}, "
|
||||
@@ -414,13 +410,14 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
f"PC3={pca.explained_variance_ratio_[2]:.3f}"
|
||||
)
|
||||
logger.info(
|
||||
f"Embedding stats: documents={len(doc_vectors)}, "
|
||||
f"total_chunks={total_chunks}, avg_chunks_per_doc={avg_chunks_per_doc:.1f}, "
|
||||
f"query_dim={len(query_embedding)}, doc_vector_dim={doc_vectors.shape[1] if doc_vectors.size > 0 else 0}"
|
||||
f"Embedding stats: chunks={len(chunk_vectors)}, "
|
||||
f"query_dim={len(query_embedding)}, chunk_vector_dim={chunk_vectors.shape[1] if chunk_vectors.size > 0 else 0}"
|
||||
)
|
||||
|
||||
# Coordinates already match search_results order (1:1 mapping)
|
||||
result_coords = [[round(float(x), 2) for x in coord] for coord in doc_coords_3d]
|
||||
result_coords = [
|
||||
[round(float(x), 2) for x in coord] for coord in chunk_coords_3d
|
||||
]
|
||||
|
||||
# Build response
|
||||
response_results = [
|
||||
@@ -449,7 +446,7 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
f"vector_fetch={vector_fetch_duration * 1000:.1f}ms ({vector_fetch_duration / total_duration * 100:.1f}%), "
|
||||
f"query_embed={query_embed_duration * 1000:.1f}ms ({query_embed_duration / total_duration * 100:.1f}%), "
|
||||
f"pca={pca_duration * 1000:.1f}ms ({pca_duration / total_duration * 100:.1f}%), "
|
||||
f"results={len(search_results)}, doc_vectors={len(doc_vectors)}"
|
||||
f"results={len(search_results)}, chunk_vectors={len(chunk_vectors)}"
|
||||
)
|
||||
|
||||
return JSONResponse(
|
||||
@@ -470,7 +467,7 @@ async def vector_visualization_search(request: Request) -> JSONResponse:
|
||||
"query_embed_ms": round(query_embed_duration * 1000, 2),
|
||||
"pca_ms": round(pca_duration * 1000, 2),
|
||||
"num_results": len(search_results),
|
||||
"num_doc_vectors": len(doc_vectors),
|
||||
"num_chunk_vectors": len(chunk_vectors),
|
||||
},
|
||||
}
|
||||
)
|
||||
@@ -519,75 +516,59 @@ async def chunk_context_endpoint(request: Request) -> JSONResponse:
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
# Type assertions - we validated these above
|
||||
assert doc_type is not None
|
||||
assert doc_id is not None
|
||||
assert start_str is not None
|
||||
assert end_str is not None
|
||||
|
||||
start = int(start_str)
|
||||
end = int(end_str)
|
||||
|
||||
# Currently only support notes
|
||||
if doc_type != "note":
|
||||
return JSONResponse(
|
||||
{"success": False, "error": f"Unsupported doc_type: {doc_type}"},
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
# Get authenticated HTTP client and fetch note
|
||||
# Get authenticated Nextcloud client
|
||||
from nextcloud_mcp_server.auth.userinfo_routes import (
|
||||
_get_authenticated_client_for_userinfo,
|
||||
)
|
||||
from nextcloud_mcp_server.client.notes import NotesClient
|
||||
from nextcloud_mcp_server.search.context import get_chunk_with_context
|
||||
|
||||
# Get username from request auth
|
||||
username = (
|
||||
request.user.display_name
|
||||
if hasattr(request.user, "display_name")
|
||||
else "unknown"
|
||||
)
|
||||
# Use context expansion module to fetch chunk with surrounding context
|
||||
async with await _get_authenticated_client_for_userinfo(request) as nc_client:
|
||||
chunk_context = await get_chunk_with_context(
|
||||
nc_client=nc_client,
|
||||
user_id=request.user.display_name, # User ID from auth
|
||||
doc_id=doc_id,
|
||||
doc_type=doc_type,
|
||||
chunk_start=start,
|
||||
chunk_end=end,
|
||||
context_chars=context_chars,
|
||||
)
|
||||
|
||||
# Create notes client with authenticated HTTP client
|
||||
http_client = await _get_authenticated_client_for_userinfo(request)
|
||||
notes_client = NotesClient(http_client, username)
|
||||
|
||||
# Fetch full note content
|
||||
note = await notes_client.get_note(int(doc_id))
|
||||
full_content = f"{note['title']}\n\n{note['content']}"
|
||||
|
||||
# Validate offsets
|
||||
if start < 0 or end > len(full_content) or start >= end:
|
||||
# Check if context expansion succeeded
|
||||
if chunk_context is None:
|
||||
return JSONResponse(
|
||||
{
|
||||
"success": False,
|
||||
"error": f"Invalid offsets: start={start}, end={end}, content_length={len(full_content)}",
|
||||
"error": f"Failed to fetch chunk context for {doc_type} {doc_id}",
|
||||
},
|
||||
status_code=400,
|
||||
status_code=404,
|
||||
)
|
||||
|
||||
# Extract chunk
|
||||
chunk_text = full_content[start:end]
|
||||
|
||||
# Extract context before and after
|
||||
before_start = max(0, start - context_chars)
|
||||
before_context = full_content[before_start:start]
|
||||
|
||||
after_end = min(len(full_content), end + context_chars)
|
||||
after_context = full_content[end:after_end]
|
||||
|
||||
# Determine if there's more content
|
||||
has_more_before = before_start > 0
|
||||
has_more_after = after_end < len(full_content)
|
||||
|
||||
logger.info(
|
||||
f"Fetched chunk context for {doc_type}_{doc_id}: "
|
||||
f"chunk_len={len(chunk_text)}, before_len={len(before_context)}, "
|
||||
f"after_len={len(after_context)}"
|
||||
f"chunk_len={len(chunk_context.chunk_text)}, "
|
||||
f"before_len={len(chunk_context.before_context)}, "
|
||||
f"after_len={len(chunk_context.after_context)}"
|
||||
)
|
||||
|
||||
# Return response compatible with frontend expectations
|
||||
return JSONResponse(
|
||||
{
|
||||
"success": True,
|
||||
"chunk_text": chunk_text,
|
||||
"before_context": before_context,
|
||||
"after_context": after_context,
|
||||
"has_more_before": has_more_before,
|
||||
"has_more_after": has_more_after,
|
||||
"chunk_text": chunk_context.chunk_text,
|
||||
"before_context": chunk_context.before_context,
|
||||
"after_context": chunk_context.after_context,
|
||||
"has_more_before": chunk_context.has_before_truncation,
|
||||
"has_more_after": chunk_context.has_after_truncation,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -190,6 +190,15 @@ class NextcloudClient:
|
||||
"""Helper to get the base WebDAV path for the authenticated user."""
|
||||
return f"/remote.php/dav/files/{self.username}"
|
||||
|
||||
async def __aenter__(self):
|
||||
"""Async context manager entry."""
|
||||
return self
|
||||
|
||||
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Async context manager exit - closes all clients."""
|
||||
await self.close()
|
||||
return False # Don't suppress exceptions
|
||||
|
||||
async def close(self):
|
||||
"""Close the HTTP client and CalDAV client."""
|
||||
await self._client.aclose()
|
||||
|
||||
@@ -35,6 +35,29 @@ class SemanticSearchResult(BaseModel):
|
||||
chunk_end_offset: Optional[int] = Field(
|
||||
default=None, description="Character position where chunk ends in document"
|
||||
)
|
||||
page_number: Optional[int] = Field(
|
||||
default=None, description="Page number for PDF documents"
|
||||
)
|
||||
# Context expansion fields (optional, populated when include_context=True)
|
||||
has_context_expansion: bool = Field(
|
||||
default=False, description="Whether context expansion was performed"
|
||||
)
|
||||
marked_text: Optional[str] = Field(
|
||||
default=None,
|
||||
description="Full text with position markers around matched chunk",
|
||||
)
|
||||
before_context: Optional[str] = Field(
|
||||
default=None, description="Text before the matched chunk"
|
||||
)
|
||||
after_context: Optional[str] = Field(
|
||||
default=None, description="Text after the matched chunk"
|
||||
)
|
||||
has_before_truncation: Optional[bool] = Field(
|
||||
default=None, description="Whether before_context was truncated"
|
||||
)
|
||||
has_after_truncation: Optional[bool] = Field(
|
||||
default=None, description="Whether after_context was truncated"
|
||||
)
|
||||
|
||||
|
||||
class SemanticSearchResponse(BaseResponse):
|
||||
|
||||
@@ -133,6 +133,9 @@ class SearchResult:
|
||||
metadata: Additional algorithm-specific metadata
|
||||
chunk_start_offset: Character position where chunk starts (None if not available)
|
||||
chunk_end_offset: Character position where chunk ends (None if not available)
|
||||
page_number: Page number for PDF documents (None for other doc types)
|
||||
chunk_index: Zero-based index of this chunk in the document
|
||||
total_chunks: Total number of chunks in the document
|
||||
"""
|
||||
|
||||
id: int
|
||||
@@ -143,6 +146,9 @@ class SearchResult:
|
||||
metadata: dict[str, Any] | None = None
|
||||
chunk_start_offset: int | None = None
|
||||
chunk_end_offset: int | None = None
|
||||
page_number: int | None = None
|
||||
chunk_index: int = 0
|
||||
total_chunks: int = 1
|
||||
|
||||
def __post_init__(self):
|
||||
"""Validate score is non-negative.
|
||||
|
||||
@@ -72,6 +72,9 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
|
||||
Returns unverified results from Qdrant. Access verification should be
|
||||
performed separately at the final output stage using verify_search_results().
|
||||
|
||||
Deduplicates by (doc_id, doc_type, chunk_start_offset, chunk_end_offset)
|
||||
to show multiple chunks from the same document while avoiding duplicate chunks.
|
||||
|
||||
Args:
|
||||
query: Natural language or keyword search query
|
||||
user_id: User ID for filtering
|
||||
@@ -176,21 +179,24 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
|
||||
f"Top 3 {self.fusion_name.upper()} fusion scores: {top_scores}"
|
||||
)
|
||||
|
||||
# Deduplicate by (doc_id, doc_type) - multiple chunks per document
|
||||
seen_docs = set()
|
||||
# Deduplicate by (doc_id, doc_type, chunk_start, chunk_end)
|
||||
# This allows multiple chunks from same doc, but removes duplicate chunks
|
||||
seen_chunks = set()
|
||||
results = []
|
||||
|
||||
for result in search_response.points:
|
||||
# doc_id can be int (notes) or str (files - file paths)
|
||||
doc_id = result.payload["doc_id"]
|
||||
doc_type = result.payload.get("doc_type", "note")
|
||||
doc_key = (doc_id, doc_type)
|
||||
chunk_start = result.payload.get("chunk_start_offset")
|
||||
chunk_end = result.payload.get("chunk_end_offset")
|
||||
chunk_key = (doc_id, doc_type, chunk_start, chunk_end)
|
||||
|
||||
# Skip if we've already seen this document
|
||||
if doc_key in seen_docs:
|
||||
# Skip if we've already seen this exact chunk
|
||||
if chunk_key in seen_chunks:
|
||||
continue
|
||||
|
||||
seen_docs.add(doc_key)
|
||||
seen_chunks.add(chunk_key)
|
||||
|
||||
# Return unverified results (verification happens at output stage)
|
||||
results.append(
|
||||
@@ -207,6 +213,9 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
|
||||
},
|
||||
chunk_start_offset=result.payload.get("chunk_start_offset"),
|
||||
chunk_end_offset=result.payload.get("chunk_end_offset"),
|
||||
page_number=result.payload.get("page_number"),
|
||||
chunk_index=result.payload.get("chunk_index", 0),
|
||||
total_chunks=result.payload.get("total_chunks", 1),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
@@ -0,0 +1,265 @@
|
||||
"""Context expansion for search results.
|
||||
|
||||
Provides utilities to expand matched chunks with surrounding context and
|
||||
position markers for better visualization and understanding of search results.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
|
||||
from nextcloud_mcp_server.client import NextcloudClient
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ChunkContext:
|
||||
"""Expanded chunk with surrounding context and position markers.
|
||||
|
||||
Attributes:
|
||||
chunk_text: The matched chunk text
|
||||
before_context: Text before the chunk (up to context_chars)
|
||||
after_context: Text after the chunk (up to context_chars)
|
||||
chunk_start_offset: Character position where chunk starts in document
|
||||
chunk_end_offset: Character position where chunk ends in document
|
||||
page_number: Page number for PDFs (None for other doc types)
|
||||
chunk_index: Zero-based chunk index (N in "chunk N of M")
|
||||
total_chunks: Total number of chunks in document
|
||||
marked_text: Full text with position markers around the chunk
|
||||
has_before_truncation: True if before_context was truncated
|
||||
has_after_truncation: True if after_context was truncated
|
||||
"""
|
||||
|
||||
chunk_text: str
|
||||
before_context: str
|
||||
after_context: str
|
||||
chunk_start_offset: int
|
||||
chunk_end_offset: int
|
||||
page_number: int | None
|
||||
chunk_index: int
|
||||
total_chunks: int
|
||||
marked_text: str
|
||||
has_before_truncation: bool
|
||||
has_after_truncation: bool
|
||||
|
||||
|
||||
async def get_chunk_with_context(
|
||||
nc_client: NextcloudClient,
|
||||
user_id: str,
|
||||
doc_id: str | int,
|
||||
doc_type: str,
|
||||
chunk_start: int,
|
||||
chunk_end: int,
|
||||
page_number: int | None = None,
|
||||
chunk_index: int = 0,
|
||||
total_chunks: int = 1,
|
||||
context_chars: int = 300,
|
||||
) -> ChunkContext | None:
|
||||
"""Fetch chunk with surrounding context from original document.
|
||||
|
||||
Retrieves the full document text and expands the matched chunk to include
|
||||
surrounding context for better understanding. Inserts position markers
|
||||
around the chunk for visualization.
|
||||
|
||||
Args:
|
||||
nc_client: Authenticated Nextcloud client
|
||||
user_id: User ID who owns the document
|
||||
doc_id: Document ID (note ID or file path)
|
||||
doc_type: Type of document ("note", "file", etc.)
|
||||
chunk_start: Character offset where chunk starts
|
||||
chunk_end: Character offset where chunk ends
|
||||
page_number: Optional page number for PDFs
|
||||
chunk_index: Zero-based chunk index in document
|
||||
total_chunks: Total number of chunks in document
|
||||
context_chars: Number of characters to include before/after chunk
|
||||
|
||||
Returns:
|
||||
ChunkContext with expanded context and markers, or None if document
|
||||
cannot be retrieved
|
||||
"""
|
||||
# Fetch full document text
|
||||
full_text = await _fetch_document_text(nc_client, doc_id, doc_type)
|
||||
if full_text is None:
|
||||
logger.warning(
|
||||
f"Could not fetch document text for {doc_type} {doc_id}, "
|
||||
"skipping context expansion"
|
||||
)
|
||||
return None
|
||||
|
||||
# Validate offsets
|
||||
if chunk_start < 0 or chunk_end > len(full_text) or chunk_start >= chunk_end:
|
||||
logger.warning(
|
||||
f"Invalid chunk offsets for {doc_type} {doc_id}: "
|
||||
f"start={chunk_start}, end={chunk_end}, doc_len={len(full_text)}"
|
||||
)
|
||||
return None
|
||||
|
||||
# Extract chunk text
|
||||
chunk_text = full_text[chunk_start:chunk_end]
|
||||
|
||||
# Calculate context boundaries
|
||||
context_start = max(0, chunk_start - context_chars)
|
||||
context_end = min(len(full_text), chunk_end + context_chars)
|
||||
|
||||
# Extract context
|
||||
before_context = full_text[context_start:chunk_start]
|
||||
after_context = full_text[chunk_end:context_end]
|
||||
|
||||
# Check for truncation
|
||||
has_before_truncation = context_start > 0
|
||||
has_after_truncation = context_end < len(full_text)
|
||||
|
||||
# Create marked text with position markers
|
||||
marked_text = _insert_position_markers(
|
||||
before_context=before_context,
|
||||
chunk_text=chunk_text,
|
||||
after_context=after_context,
|
||||
page_number=page_number,
|
||||
chunk_index=chunk_index,
|
||||
total_chunks=total_chunks,
|
||||
has_before_truncation=has_before_truncation,
|
||||
has_after_truncation=has_after_truncation,
|
||||
)
|
||||
|
||||
return ChunkContext(
|
||||
chunk_text=chunk_text,
|
||||
before_context=before_context,
|
||||
after_context=after_context,
|
||||
chunk_start_offset=chunk_start,
|
||||
chunk_end_offset=chunk_end,
|
||||
page_number=page_number,
|
||||
chunk_index=chunk_index,
|
||||
total_chunks=total_chunks,
|
||||
marked_text=marked_text,
|
||||
has_before_truncation=has_before_truncation,
|
||||
has_after_truncation=has_after_truncation,
|
||||
)
|
||||
|
||||
|
||||
async def _fetch_document_text(
|
||||
nc_client: NextcloudClient, doc_id: str | int, doc_type: str
|
||||
) -> str | None:
|
||||
"""Fetch full text content of a document.
|
||||
|
||||
Args:
|
||||
nc_client: Authenticated Nextcloud client
|
||||
doc_id: Document ID (note ID or file path)
|
||||
doc_type: Type of document ("note", "file", etc.)
|
||||
|
||||
Returns:
|
||||
Full document text, or None if document cannot be retrieved
|
||||
"""
|
||||
try:
|
||||
if doc_type == "note":
|
||||
# Fetch note by ID
|
||||
note = await nc_client.notes.get_note(note_id=int(doc_id))
|
||||
return note.get("content", "")
|
||||
elif doc_type == "file":
|
||||
# Fetch file content via WebDAV
|
||||
try:
|
||||
file_path = str(doc_id)
|
||||
file_content, content_type = await nc_client.webdav.read_file(file_path)
|
||||
|
||||
# Check if it's a PDF (by content type or file extension)
|
||||
is_pdf = (
|
||||
content_type and "pdf" in content_type.lower()
|
||||
) or file_path.lower().endswith(".pdf")
|
||||
|
||||
if is_pdf:
|
||||
# Extract text from PDF using PyMuPDF
|
||||
import fitz # PyMuPDF
|
||||
|
||||
logger.debug(f"Extracting text from PDF: {file_path}")
|
||||
pdf_doc = fitz.open(stream=file_content, filetype="pdf")
|
||||
text_parts = []
|
||||
for page in pdf_doc:
|
||||
text_parts.append(page.get_text())
|
||||
pdf_doc.close()
|
||||
|
||||
full_text = "\n".join(text_parts)
|
||||
logger.debug(
|
||||
f"Extracted {len(full_text)} characters from "
|
||||
f"{len(text_parts)} pages in {file_path}"
|
||||
)
|
||||
return full_text
|
||||
else:
|
||||
# Assume it's a text file, decode to string
|
||||
logger.debug(f"Decoding text file: {file_path}")
|
||||
return file_content.decode("utf-8", errors="replace")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Error fetching file content for {doc_id}: {e}", exc_info=True
|
||||
)
|
||||
return None
|
||||
else:
|
||||
logger.warning(f"Unsupported doc_type for context expansion: {doc_type}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Error fetching document {doc_type} {doc_id}: {e}", exc_info=True)
|
||||
return None
|
||||
|
||||
|
||||
def _insert_position_markers(
|
||||
before_context: str,
|
||||
chunk_text: str,
|
||||
after_context: str,
|
||||
page_number: int | None,
|
||||
chunk_index: int,
|
||||
total_chunks: int,
|
||||
has_before_truncation: bool,
|
||||
has_after_truncation: bool,
|
||||
) -> str:
|
||||
"""Insert position markers around matched chunk.
|
||||
|
||||
Creates markdown-formatted text with visual markers indicating chunk
|
||||
boundaries and metadata.
|
||||
|
||||
Args:
|
||||
before_context: Text before chunk
|
||||
chunk_text: The matched chunk
|
||||
after_context: Text after chunk
|
||||
page_number: Optional page number
|
||||
chunk_index: Zero-based chunk index
|
||||
total_chunks: Total chunks in document
|
||||
has_before_truncation: Whether before_context is truncated
|
||||
has_after_truncation: Whether after_context is truncated
|
||||
|
||||
Returns:
|
||||
Formatted text with position markers
|
||||
"""
|
||||
# Build position metadata
|
||||
position_parts = []
|
||||
if page_number is not None:
|
||||
position_parts.append(f"Page {page_number}")
|
||||
position_parts.append(f"Chunk {chunk_index + 1} of {total_chunks}")
|
||||
position_metadata = ", ".join(position_parts)
|
||||
|
||||
# Build marked text
|
||||
parts = []
|
||||
|
||||
# Add truncation indicator for before context
|
||||
if has_before_truncation:
|
||||
parts.append("**[...]**\n\n")
|
||||
|
||||
# Add before context if present
|
||||
if before_context:
|
||||
parts.append(before_context)
|
||||
|
||||
# Add chunk start marker
|
||||
parts.append(f"\n\n🔍 **MATCHED CHUNK START** ({position_metadata})\n\n")
|
||||
|
||||
# Add chunk text
|
||||
parts.append(chunk_text)
|
||||
|
||||
# Add chunk end marker
|
||||
parts.append("\n\n🔍 **MATCHED CHUNK END**\n\n")
|
||||
|
||||
# Add after context if present
|
||||
if after_context:
|
||||
parts.append(after_context)
|
||||
|
||||
# Add truncation indicator for after context
|
||||
if has_after_truncation:
|
||||
parts.append("\n\n**[...]**")
|
||||
|
||||
return "".join(parts)
|
||||
@@ -50,6 +50,9 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
|
||||
Returns unverified results from Qdrant. Access verification should be
|
||||
performed separately at the final output stage using verify_search_results().
|
||||
|
||||
Deduplicates by (doc_id, doc_type, chunk_start_offset, chunk_end_offset)
|
||||
to show multiple chunks from the same document while avoiding duplicate chunks.
|
||||
|
||||
Args:
|
||||
query: Natural language search query
|
||||
user_id: User ID for filtering
|
||||
@@ -123,21 +126,24 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
|
||||
top_scores = [p.score for p in search_response.points[:3]]
|
||||
logger.debug(f"Top 3 similarity scores: {top_scores}")
|
||||
|
||||
# Deduplicate by (doc_id, doc_type) - multiple chunks per document
|
||||
seen_docs = set()
|
||||
# Deduplicate by (doc_id, doc_type, chunk_start, chunk_end)
|
||||
# This allows multiple chunks from same doc, but removes duplicate chunks
|
||||
seen_chunks = set()
|
||||
results = []
|
||||
|
||||
for result in search_response.points:
|
||||
# doc_id can be int (notes) or str (files - file paths)
|
||||
doc_id = result.payload["doc_id"]
|
||||
doc_type = result.payload.get("doc_type", "note")
|
||||
doc_key = (doc_id, doc_type)
|
||||
chunk_start = result.payload.get("chunk_start_offset")
|
||||
chunk_end = result.payload.get("chunk_end_offset")
|
||||
chunk_key = (doc_id, doc_type, chunk_start, chunk_end)
|
||||
|
||||
# Skip if we've already seen this document
|
||||
if doc_key in seen_docs:
|
||||
# Skip if we've already seen this exact chunk
|
||||
if chunk_key in seen_chunks:
|
||||
continue
|
||||
|
||||
seen_docs.add(doc_key)
|
||||
seen_chunks.add(chunk_key)
|
||||
|
||||
# Return unverified results (verification happens at output stage)
|
||||
results.append(
|
||||
@@ -153,6 +159,9 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
|
||||
},
|
||||
chunk_start_offset=result.payload.get("chunk_start_offset"),
|
||||
chunk_end_offset=result.payload.get("chunk_end_offset"),
|
||||
page_number=result.payload.get("page_number"),
|
||||
chunk_index=result.payload.get("chunk_index", 0),
|
||||
total_chunks=result.payload.get("total_chunks", 1),
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user