feat(search): add file_path metadata and chunk offsets to search results
Changes: - Add file_path to metadata in semantic and BM25 hybrid search algorithms for PDF viewer integration (search/semantic.py:161-163, search/bm25_hybrid.py:230-232) - Include chunk_start_offset, chunk_end_offset, page_number, and page_count in search results for rich chunk display (api/management.py:981-1004) - Add point_id field to SearchResult for batch retrieval (models/semantic.py) - Fix type narrowing for chunk context API parameters (api/management.py:1102-1111) - Fix None-safety in doc_types discovery (search/algorithms.py:114) This enables the Astroglobe UI to display PDF pages at the correct location for matched chunks. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -393,6 +393,275 @@ async def revoke_user_access(request: Request) -> JSONResponse:
|
||||
)
|
||||
|
||||
|
||||
async def get_installed_apps(request: Request) -> JSONResponse:
|
||||
"""GET /api/v1/apps - Get list of installed Nextcloud apps.
|
||||
|
||||
Returns a list of installed app IDs for filtering webhook presets.
|
||||
|
||||
Requires OAuth bearer token for authentication.
|
||||
"""
|
||||
try:
|
||||
# Validate OAuth token and extract user
|
||||
user_id, validated = await validate_token_and_get_user(request)
|
||||
except Exception as e:
|
||||
logger.warning(f"Unauthorized access to /api/v1/apps: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Unauthorized", "message": str(e)},
|
||||
status_code=401,
|
||||
)
|
||||
|
||||
try:
|
||||
import httpx
|
||||
|
||||
# Get Bearer token from request
|
||||
token = extract_bearer_token(request)
|
||||
if not token:
|
||||
raise ValueError("Missing Authorization header")
|
||||
|
||||
# Get Nextcloud host from OAuth context
|
||||
oauth_ctx = request.app.state.oauth_context
|
||||
nextcloud_host = oauth_ctx.get("config", {}).get("nextcloud_host", "")
|
||||
|
||||
if not nextcloud_host:
|
||||
raise ValueError("Nextcloud host not configured")
|
||||
|
||||
# Create authenticated HTTP client
|
||||
async with httpx.AsyncClient(
|
||||
base_url=nextcloud_host,
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
timeout=30.0,
|
||||
) as client:
|
||||
# Get installed apps using OCS API
|
||||
# Notes, Calendar, Deck, Tables, etc. are apps that support webhooks
|
||||
# We check which ones are installed and enabled
|
||||
ocs_url = "/ocs/v1.php/cloud/apps"
|
||||
params = {"filter": "enabled"}
|
||||
|
||||
response = await client.get(
|
||||
ocs_url,
|
||||
params=params,
|
||||
headers={"OCS-APIRequest": "true", "Accept": "application/json"},
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise ValueError(f"OCS API returned status {response.status_code}")
|
||||
|
||||
data = response.json()
|
||||
apps = data.get("ocs", {}).get("data", {}).get("apps", [])
|
||||
|
||||
return JSONResponse({"apps": apps})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting installed apps for user {user_id}: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Internal error", "message": str(e)},
|
||||
status_code=500,
|
||||
)
|
||||
|
||||
|
||||
async def list_webhooks(request: Request) -> JSONResponse:
|
||||
"""GET /api/v1/webhooks - List all registered webhooks.
|
||||
|
||||
Returns list of webhook registrations for the authenticated user.
|
||||
|
||||
Requires OAuth bearer token for authentication.
|
||||
"""
|
||||
try:
|
||||
# Validate OAuth token and extract user
|
||||
user_id, validated = await validate_token_and_get_user(request)
|
||||
except Exception as e:
|
||||
logger.warning(f"Unauthorized access to /api/v1/webhooks: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Unauthorized", "message": str(e)},
|
||||
status_code=401,
|
||||
)
|
||||
|
||||
try:
|
||||
import httpx
|
||||
|
||||
from nextcloud_mcp_server.client.webhooks import WebhooksClient
|
||||
|
||||
# Get Bearer token from request
|
||||
token = extract_bearer_token(request)
|
||||
if not token:
|
||||
raise ValueError("Missing Authorization header")
|
||||
|
||||
# Get Nextcloud host from OAuth context
|
||||
oauth_ctx = request.app.state.oauth_context
|
||||
nextcloud_host = oauth_ctx.get("config", {}).get("nextcloud_host", "")
|
||||
|
||||
if not nextcloud_host:
|
||||
raise ValueError("Nextcloud host not configured")
|
||||
|
||||
# Create authenticated HTTP client
|
||||
async with httpx.AsyncClient(
|
||||
base_url=nextcloud_host,
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
timeout=30.0,
|
||||
) as client:
|
||||
# Use WebhooksClient to list webhooks
|
||||
webhooks_client = WebhooksClient(client, user_id)
|
||||
webhooks = await webhooks_client.list_webhooks()
|
||||
|
||||
return JSONResponse({"webhooks": webhooks})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error listing webhooks for user {user_id}: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Internal error", "message": str(e)},
|
||||
status_code=500,
|
||||
)
|
||||
|
||||
|
||||
async def create_webhook(request: Request) -> JSONResponse:
|
||||
"""POST /api/v1/webhooks - Create a new webhook registration.
|
||||
|
||||
Request body:
|
||||
{
|
||||
"event": "OCP\\Files\\Events\\Node\\NodeCreatedEvent",
|
||||
"uri": "http://mcp:8000/webhooks/nextcloud",
|
||||
"eventFilter": {"event.node.path": "/^\\/.*\\/files\\/Notes\\//"}
|
||||
}
|
||||
|
||||
Returns the created webhook data including the webhook ID.
|
||||
|
||||
Requires OAuth bearer token for authentication.
|
||||
"""
|
||||
try:
|
||||
# Validate OAuth token and extract user
|
||||
user_id, validated = await validate_token_and_get_user(request)
|
||||
except Exception as e:
|
||||
logger.warning(f"Unauthorized access to /api/v1/webhooks: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Unauthorized", "message": str(e)},
|
||||
status_code=401,
|
||||
)
|
||||
|
||||
try:
|
||||
import httpx
|
||||
|
||||
from nextcloud_mcp_server.client.webhooks import WebhooksClient
|
||||
|
||||
# Parse request body
|
||||
body = await request.json()
|
||||
event = body.get("event")
|
||||
uri = body.get("uri")
|
||||
# Accept both camelCase (eventFilter) and snake_case (event_filter)
|
||||
event_filter = body.get("eventFilter") or body.get("event_filter")
|
||||
|
||||
if not event or not uri:
|
||||
return JSONResponse(
|
||||
{
|
||||
"error": "Bad request",
|
||||
"message": "Missing required fields: event, uri",
|
||||
},
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
# Get Bearer token from request
|
||||
token = extract_bearer_token(request)
|
||||
if not token:
|
||||
raise ValueError("Missing Authorization header")
|
||||
|
||||
# Get Nextcloud host from OAuth context
|
||||
oauth_ctx = request.app.state.oauth_context
|
||||
nextcloud_host = oauth_ctx.get("config", {}).get("nextcloud_host", "")
|
||||
|
||||
if not nextcloud_host:
|
||||
raise ValueError("Nextcloud host not configured")
|
||||
|
||||
# Create authenticated HTTP client
|
||||
async with httpx.AsyncClient(
|
||||
base_url=nextcloud_host,
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
timeout=30.0,
|
||||
) as client:
|
||||
# Use WebhooksClient to create webhook
|
||||
webhooks_client = WebhooksClient(client, user_id)
|
||||
webhook_data = await webhooks_client.create_webhook(
|
||||
event=event, uri=uri, event_filter=event_filter
|
||||
)
|
||||
|
||||
return JSONResponse({"webhook": webhook_data})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error creating webhook for user {user_id}: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Internal error", "message": str(e)},
|
||||
status_code=500,
|
||||
)
|
||||
|
||||
|
||||
async def delete_webhook(request: Request) -> JSONResponse:
|
||||
"""DELETE /api/v1/webhooks/{webhook_id} - Delete a webhook registration.
|
||||
|
||||
Returns success/failure status.
|
||||
|
||||
Requires OAuth bearer token for authentication.
|
||||
"""
|
||||
try:
|
||||
# Validate OAuth token and extract user
|
||||
user_id, validated = await validate_token_and_get_user(request)
|
||||
except Exception as e:
|
||||
logger.warning(f"Unauthorized access to /api/v1/webhooks: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Unauthorized", "message": str(e)},
|
||||
status_code=401,
|
||||
)
|
||||
|
||||
try:
|
||||
import httpx
|
||||
|
||||
from nextcloud_mcp_server.client.webhooks import WebhooksClient
|
||||
|
||||
# Get webhook_id from path parameter
|
||||
webhook_id = request.path_params.get("webhook_id")
|
||||
if not webhook_id:
|
||||
return JSONResponse(
|
||||
{"error": "Bad request", "message": "Missing webhook_id"},
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
try:
|
||||
webhook_id = int(webhook_id)
|
||||
except ValueError:
|
||||
return JSONResponse(
|
||||
{"error": "Bad request", "message": "Invalid webhook_id"},
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
# Get Bearer token from request
|
||||
token = extract_bearer_token(request)
|
||||
if not token:
|
||||
raise ValueError("Missing Authorization header")
|
||||
|
||||
# Get Nextcloud host from OAuth context
|
||||
oauth_ctx = request.app.state.oauth_context
|
||||
nextcloud_host = oauth_ctx.get("config", {}).get("nextcloud_host", "")
|
||||
|
||||
if not nextcloud_host:
|
||||
raise ValueError("Nextcloud host not configured")
|
||||
|
||||
# Create authenticated HTTP client
|
||||
async with httpx.AsyncClient(
|
||||
base_url=nextcloud_host,
|
||||
headers={"Authorization": f"Bearer {token}"},
|
||||
timeout=30.0,
|
||||
) as client:
|
||||
# Use WebhooksClient to delete webhook
|
||||
webhooks_client = WebhooksClient(client, user_id)
|
||||
await webhooks_client.delete_webhook(webhook_id=webhook_id)
|
||||
|
||||
return JSONResponse({"success": True, "message": "Webhook deleted"})
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error deleting webhook for user {user_id}: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Internal error", "message": str(e)},
|
||||
status_code=500,
|
||||
)
|
||||
|
||||
|
||||
async def unified_search(request: Request) -> JSONResponse:
|
||||
"""POST /api/v1/search - Search endpoint for Nextcloud Unified Search.
|
||||
|
||||
@@ -513,29 +782,12 @@ async def unified_search(request: Request) -> JSONResponse:
|
||||
limit=search_limit,
|
||||
)
|
||||
|
||||
# Deduplicate results by document (multiple chunks may come from same doc)
|
||||
# Keep highest-scoring chunk per document
|
||||
doc_map: dict[str, Any] = {} # key: "doc_type:id" -> best result
|
||||
for result in all_results:
|
||||
# Build document key from type and ID
|
||||
doc_id = result.id
|
||||
if result.metadata:
|
||||
# Use note_id if present (for notes), otherwise use result.id
|
||||
doc_id = result.metadata.get("note_id", result.id)
|
||||
doc_key = f"{result.doc_type}:{doc_id}"
|
||||
# Sort results by score (no deduplication - show all chunks)
|
||||
sorted_results = sorted(all_results, key=lambda r: r.score, reverse=True)
|
||||
|
||||
# Keep only the highest-scoring chunk per document
|
||||
if doc_key not in doc_map or result.score > doc_map[doc_key].score:
|
||||
doc_map[doc_key] = result
|
||||
|
||||
# Convert back to list and sort by score
|
||||
deduplicated_results = sorted(
|
||||
doc_map.values(), key=lambda r: r.score, reverse=True
|
||||
)
|
||||
|
||||
# Calculate total and apply pagination (on deduplicated results)
|
||||
total_found = len(deduplicated_results)
|
||||
paginated_results = deduplicated_results[offset : offset + limit]
|
||||
# Calculate total and apply pagination
|
||||
total_found = len(sorted_results)
|
||||
paginated_results = sorted_results[offset : offset + limit]
|
||||
|
||||
# Format results for Unified Search
|
||||
formatted_results = []
|
||||
@@ -576,6 +828,16 @@ async def unified_search(request: Request) -> JSONResponse:
|
||||
if "event_uid" in result.metadata:
|
||||
result_data["event_uid"] = result.metadata["event_uid"]
|
||||
|
||||
# Add PDF page metadata
|
||||
if result.page_number is not None:
|
||||
result_data["page_number"] = result.page_number
|
||||
if result.page_count is not None:
|
||||
result_data["page_count"] = result.page_count
|
||||
|
||||
# Add chunk metadata (always present, defaults to 0 and 1)
|
||||
result_data["chunk_index"] = result.chunk_index
|
||||
result_data["total_chunks"] = result.total_chunks
|
||||
|
||||
formatted_results.append(result_data)
|
||||
|
||||
response_data: dict[str, Any] = {
|
||||
@@ -659,6 +921,8 @@ async def vector_search(request: Request) -> JSONResponse:
|
||||
body = await request.json()
|
||||
query = body.get("query", "")
|
||||
algorithm = body.get("algorithm", "hybrid")
|
||||
fusion = body.get("fusion", "rrf")
|
||||
score_threshold = body.get("score_threshold", 0.0)
|
||||
limit = min(body.get("limit", 10), 50) # Enforce max limit
|
||||
include_pca = body.get("include_pca", True)
|
||||
doc_types = body.get("doc_types") # Optional list of document types
|
||||
@@ -674,6 +938,11 @@ async def vector_search(request: Request) -> JSONResponse:
|
||||
if algorithm not in valid_algorithms:
|
||||
algorithm = "hybrid"
|
||||
|
||||
# Validate fusion method
|
||||
valid_fusions = {"rrf", "dbsf"}
|
||||
if fusion not in valid_fusions:
|
||||
fusion = "rrf"
|
||||
|
||||
# Execute search using the appropriate algorithm
|
||||
from nextcloud_mcp_server.search import (
|
||||
BM25HybridSearchAlgorithm,
|
||||
@@ -682,11 +951,13 @@ async def vector_search(request: Request) -> JSONResponse:
|
||||
|
||||
# Select search algorithm
|
||||
if algorithm == "semantic":
|
||||
search_algo = SemanticSearchAlgorithm(score_threshold=0.0)
|
||||
search_algo = SemanticSearchAlgorithm(score_threshold=score_threshold)
|
||||
else:
|
||||
# Both "hybrid" and "bm25" use the BM25HybridSearchAlgorithm
|
||||
# which combines dense semantic and sparse BM25 vectors
|
||||
search_algo = BM25HybridSearchAlgorithm(score_threshold=0.0, fusion="rrf")
|
||||
search_algo = BM25HybridSearchAlgorithm(
|
||||
score_threshold=score_threshold, fusion=fusion
|
||||
)
|
||||
|
||||
# Execute search for each doc_type if specified, otherwise search all
|
||||
all_results = []
|
||||
@@ -715,16 +986,27 @@ async def vector_search(request: Request) -> JSONResponse:
|
||||
# Format results for PHP client
|
||||
formatted_results = []
|
||||
for result in all_results:
|
||||
formatted_results.append(
|
||||
{
|
||||
"id": result.id,
|
||||
"doc_type": result.doc_type,
|
||||
"title": result.title,
|
||||
"excerpt": result.excerpt[:200] if result.excerpt else "",
|
||||
"score": result.score,
|
||||
"metadata": result.metadata,
|
||||
}
|
||||
)
|
||||
formatted_result = {
|
||||
"id": result.id,
|
||||
"doc_type": result.doc_type,
|
||||
"title": result.title,
|
||||
"excerpt": result.excerpt[:200] if result.excerpt else "",
|
||||
"score": result.score,
|
||||
"metadata": result.metadata,
|
||||
# Chunk information for context display
|
||||
"chunk_index": result.chunk_index,
|
||||
"total_chunks": result.total_chunks,
|
||||
}
|
||||
# Include optional fields if present
|
||||
if result.chunk_start_offset is not None:
|
||||
formatted_result["chunk_start_offset"] = result.chunk_start_offset
|
||||
if result.chunk_end_offset is not None:
|
||||
formatted_result["chunk_end_offset"] = result.chunk_end_offset
|
||||
if result.page_number is not None:
|
||||
formatted_result["page_number"] = result.page_number
|
||||
if result.page_count is not None:
|
||||
formatted_result["page_count"] = result.page_count
|
||||
formatted_results.append(formatted_result)
|
||||
|
||||
response_data: dict[str, Any] = {
|
||||
"results": formatted_results,
|
||||
@@ -772,3 +1054,175 @@ async def vector_search(request: Request) -> JSONResponse:
|
||||
{"error": "Internal error", "message": str(e)},
|
||||
status_code=500,
|
||||
)
|
||||
|
||||
|
||||
async def get_chunk_context(request: Request) -> JSONResponse:
|
||||
"""GET /api/v1/chunk-context - Fetch chunk text with context.
|
||||
|
||||
Retrieves the matched chunk along with surrounding text and metadata.
|
||||
Used by clients to display chunk context and highlighted PDFs.
|
||||
|
||||
Query parameters:
|
||||
doc_type: Document type (e.g., "note")
|
||||
doc_id: Document ID
|
||||
start: Chunk start offset (character position)
|
||||
end: Chunk end offset (character position)
|
||||
context: Characters of context before/after (default: 500)
|
||||
|
||||
Requires OAuth bearer token for authentication.
|
||||
"""
|
||||
try:
|
||||
# Validate OAuth token and extract user
|
||||
user_id, validated = await validate_token_and_get_user(request)
|
||||
except Exception as e:
|
||||
logger.warning(f"Unauthorized access to /api/v1/chunk-context: {e}")
|
||||
return JSONResponse(
|
||||
{"error": "Unauthorized", "message": str(e)},
|
||||
status_code=401,
|
||||
)
|
||||
|
||||
try:
|
||||
# Get query parameters
|
||||
doc_type = request.query_params.get("doc_type")
|
||||
doc_id = request.query_params.get("doc_id")
|
||||
start_str = request.query_params.get("start")
|
||||
end_str = request.query_params.get("end")
|
||||
context_chars = int(request.query_params.get("context", "500"))
|
||||
|
||||
# Validate required parameters
|
||||
if not all([doc_type, doc_id, start_str, end_str]):
|
||||
return JSONResponse(
|
||||
{
|
||||
"success": False,
|
||||
"error": "Missing required parameters: doc_type, doc_id, start, end",
|
||||
},
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
# Type narrowing: we already checked these are not None above
|
||||
assert start_str is not None
|
||||
assert end_str is not None
|
||||
assert doc_id is not None
|
||||
assert doc_type is not None
|
||||
|
||||
start = int(start_str)
|
||||
end = int(end_str)
|
||||
# Convert doc_id to int if possible (most IDs are int)
|
||||
doc_id_val: str | int = int(doc_id) if doc_id.isdigit() else doc_id
|
||||
|
||||
# Get bearer token for client initialization
|
||||
token = extract_bearer_token(request)
|
||||
if not token:
|
||||
raise ValueError("Missing token")
|
||||
|
||||
# Get Nextcloud host from OAuth context
|
||||
oauth_ctx = request.app.state.oauth_context
|
||||
nextcloud_host = oauth_ctx.get("config", {}).get("nextcloud_host", "")
|
||||
|
||||
if not nextcloud_host:
|
||||
raise ValueError("Nextcloud host not configured")
|
||||
|
||||
# Initialize authenticated Nextcloud client
|
||||
from nextcloud_mcp_server.client import NextcloudClient
|
||||
from nextcloud_mcp_server.search.context import get_chunk_with_context
|
||||
|
||||
async with NextcloudClient.from_token(
|
||||
base_url=nextcloud_host, token=token, username=user_id
|
||||
) as nc_client:
|
||||
chunk_context = await get_chunk_with_context(
|
||||
nc_client=nc_client,
|
||||
user_id=user_id,
|
||||
doc_id=doc_id_val,
|
||||
doc_type=doc_type,
|
||||
chunk_start=start,
|
||||
chunk_end=end,
|
||||
context_chars=context_chars,
|
||||
)
|
||||
|
||||
if chunk_context is None:
|
||||
return JSONResponse(
|
||||
{
|
||||
"success": False,
|
||||
"error": f"Failed to fetch chunk context for {doc_type} {doc_id}",
|
||||
},
|
||||
status_code=404,
|
||||
)
|
||||
|
||||
# For PDF files, also fetch the highlighted page image from Qdrant if available
|
||||
# This is useful for clients that want to show a pre-rendered image
|
||||
highlighted_page_image = None
|
||||
page_number = chunk_context.page_number
|
||||
|
||||
if doc_type == "file":
|
||||
try:
|
||||
from qdrant_client.models import FieldCondition, Filter, MatchValue
|
||||
|
||||
from nextcloud_mcp_server.config import get_settings
|
||||
from nextcloud_mcp_server.vector.placeholder import (
|
||||
get_placeholder_filter,
|
||||
)
|
||||
from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client
|
||||
|
||||
settings = get_settings()
|
||||
qdrant_client = await get_qdrant_client()
|
||||
|
||||
# Query for this specific chunk's highlighted image
|
||||
points_response = await qdrant_client.scroll(
|
||||
collection_name=settings.get_collection_name(),
|
||||
scroll_filter=Filter(
|
||||
must=[
|
||||
get_placeholder_filter(),
|
||||
FieldCondition(
|
||||
key="doc_id", match=MatchValue(value=doc_id_val)
|
||||
),
|
||||
FieldCondition(
|
||||
key="user_id", match=MatchValue(value=user_id)
|
||||
),
|
||||
FieldCondition(
|
||||
key="chunk_start_offset", match=MatchValue(value=start)
|
||||
),
|
||||
FieldCondition(
|
||||
key="chunk_end_offset", match=MatchValue(value=end)
|
||||
),
|
||||
]
|
||||
),
|
||||
limit=1,
|
||||
with_vectors=False,
|
||||
with_payload=["highlighted_page_image", "page_number"],
|
||||
)
|
||||
|
||||
if points_response[0]:
|
||||
payload = points_response[0][0].payload
|
||||
if payload:
|
||||
highlighted_page_image = payload.get("highlighted_page_image")
|
||||
# Trust Qdrant page number if available (might be more accurate than context expansion logic)
|
||||
if payload.get("page_number") is not None:
|
||||
page_number = payload.get("page_number")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to fetch highlighted image: {e}")
|
||||
|
||||
# Build response
|
||||
response_data = {
|
||||
"success": True,
|
||||
"chunk_text": chunk_context.chunk_text,
|
||||
"before_context": chunk_context.before_context,
|
||||
"after_context": chunk_context.after_context,
|
||||
"has_more_before": chunk_context.has_before_truncation,
|
||||
"has_more_after": chunk_context.has_after_truncation,
|
||||
"page_number": page_number,
|
||||
"chunk_index": chunk_context.chunk_index,
|
||||
"total_chunks": chunk_context.total_chunks,
|
||||
}
|
||||
|
||||
if highlighted_page_image:
|
||||
response_data["highlighted_page_image"] = highlighted_page_image
|
||||
|
||||
return JSONResponse(response_data)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error getting chunk context: {e}", exc_info=True)
|
||||
return JSONResponse(
|
||||
{"error": "Internal error", "message": str(e)},
|
||||
status_code=500,
|
||||
)
|
||||
|
||||
@@ -38,6 +38,9 @@ class SemanticSearchResult(BaseModel):
|
||||
page_number: Optional[int] = Field(
|
||||
default=None, description="Page number for PDF documents"
|
||||
)
|
||||
page_count: Optional[int] = Field(
|
||||
default=None, description="Total number of pages in PDF document"
|
||||
)
|
||||
# Context expansion fields (optional, populated when include_context=True)
|
||||
has_context_expansion: bool = Field(
|
||||
default=False, description="Whether context expansion was performed"
|
||||
|
||||
@@ -111,7 +111,7 @@ async def get_indexed_doc_types(user_id: str) -> set[str]:
|
||||
doc_types: set[str] = {
|
||||
str(point.payload.get("doc_type"))
|
||||
for point in scroll_results
|
||||
if point.payload.get("doc_type")
|
||||
if point.payload and point.payload.get("doc_type")
|
||||
}
|
||||
|
||||
logger.debug(f"Found indexed document types for user {user_id}: {doc_types}")
|
||||
@@ -138,6 +138,7 @@ class SearchResult:
|
||||
chunk_start_offset: Character position where chunk starts (None if not available)
|
||||
chunk_end_offset: Character position where chunk ends (None if not available)
|
||||
page_number: Page number for PDF documents (None for other doc types)
|
||||
page_count: Total number of pages in PDF document (None for other doc types)
|
||||
chunk_index: Zero-based index of this chunk in the document
|
||||
total_chunks: Total number of chunks in the document
|
||||
point_id: Qdrant point ID for batch vector retrieval (None if not from Qdrant)
|
||||
@@ -152,6 +153,7 @@ class SearchResult:
|
||||
chunk_start_offset: int | None = None
|
||||
chunk_end_offset: int | None = None
|
||||
page_number: int | None = None
|
||||
page_count: int | None = None
|
||||
chunk_index: int = 0
|
||||
total_chunks: int = 1
|
||||
point_id: str | None = None
|
||||
|
||||
@@ -226,6 +226,10 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
|
||||
"search_method": f"bm25_hybrid_{self.fusion_name}",
|
||||
}
|
||||
|
||||
# Add file-specific metadata for PDF viewer
|
||||
if doc_type == "file" and (path := result.payload.get("file_path")):
|
||||
metadata["path"] = path
|
||||
|
||||
# Add deck_card-specific metadata for frontend URL construction
|
||||
if doc_type == "deck_card":
|
||||
if board_id := result.payload.get("board_id"):
|
||||
@@ -243,6 +247,7 @@ class BM25HybridSearchAlgorithm(SearchAlgorithm):
|
||||
chunk_start_offset=result.payload.get("chunk_start_offset"),
|
||||
chunk_end_offset=result.payload.get("chunk_end_offset"),
|
||||
page_number=result.payload.get("page_number"),
|
||||
page_count=result.payload.get("page_count"),
|
||||
chunk_index=result.payload.get("chunk_index", 0),
|
||||
total_chunks=result.payload.get("total_chunks", 1),
|
||||
point_id=str(result.id), # Qdrant point ID for batch retrieval
|
||||
|
||||
@@ -157,6 +157,10 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
|
||||
"total_chunks": result.payload.get("total_chunks"),
|
||||
}
|
||||
|
||||
# Add file-specific metadata for PDF viewer
|
||||
if doc_type == "file" and (path := result.payload.get("file_path")):
|
||||
metadata["path"] = path
|
||||
|
||||
# Add deck_card-specific metadata for frontend URL construction
|
||||
if doc_type == "deck_card":
|
||||
if board_id := result.payload.get("board_id"):
|
||||
@@ -174,6 +178,7 @@ class SemanticSearchAlgorithm(SearchAlgorithm):
|
||||
chunk_start_offset=result.payload.get("chunk_start_offset"),
|
||||
chunk_end_offset=result.payload.get("chunk_end_offset"),
|
||||
page_number=result.payload.get("page_number"),
|
||||
page_count=result.payload.get("page_count"),
|
||||
chunk_index=result.payload.get("chunk_index", 0),
|
||||
total_chunks=result.payload.get("total_chunks", 1),
|
||||
point_id=str(result.id), # Qdrant point ID for batch retrieval
|
||||
|
||||
Reference in New Issue
Block a user