85db90a2df
Changes: - Add file_path to metadata in semantic and BM25 hybrid search algorithms for PDF viewer integration (search/semantic.py:161-163, search/bm25_hybrid.py:230-232) - Include chunk_start_offset, chunk_end_offset, page_number, and page_count in search results for rich chunk display (api/management.py:981-1004) - Add point_id field to SearchResult for batch retrieval (models/semantic.py) - Fix type narrowing for chunk context API parameters (api/management.py:1102-1111) - Fix None-safety in doc_types discovery (search/algorithms.py:114) This enables the Astroglobe UI to display PDF pages at the correct location for matched chunks. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
149 lines
5.6 KiB
Python
149 lines
5.6 KiB
Python
"""Pydantic models for semantic search responses."""
|
|
|
|
from typing import List, Optional
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from .base import BaseResponse
|
|
|
|
|
|
class SemanticSearchResult(BaseModel):
    """Model for semantic search results with additional metadata."""

    # --- Document identity and display data -------------------------------
    id: int = Field(..., description="Document ID (int for all document types)")
    doc_type: str = Field(
        ..., description="Document type (note, calendar_event, deck_card, etc.)"
    )
    title: str = Field(..., description="Document title")
    category: str = Field(
        "", description="Document category (notes) or location (calendar)"
    )
    excerpt: str = Field(..., description="Excerpt from matching chunk")

    # --- Ranking -----------------------------------------------------------
    # The description documents the expected range only; no numeric
    # constraint is enforced on the value here.
    score: float = Field(
        ...,
        description=(
            "Relevance score (≥ 0.0, higher is better). "
            "Score range depends on fusion method: "
            "RRF produces scores in [0.0, 1.0], "
            "DBSF can exceed 1.0 (sum of normalized scores from multiple systems)"
        ),
    )

    # --- Location of the matched chunk ------------------------------------
    chunk_index: int = Field(..., description="Index of matching chunk in document")
    total_chunks: int = Field(..., description="Total number of chunks in document")
    # Offsets and page data are optional and default to None when absent.
    chunk_start_offset: Optional[int] = Field(
        None, description="Character position where chunk starts in document"
    )
    chunk_end_offset: Optional[int] = Field(
        None, description="Character position where chunk ends in document"
    )
    page_number: Optional[int] = Field(
        None, description="Page number for PDF documents"
    )
    page_count: Optional[int] = Field(
        None, description="Total number of pages in PDF document"
    )

    # Context expansion fields (optional, populated when include_context=True)
    has_context_expansion: bool = Field(
        False, description="Whether context expansion was performed"
    )
    marked_text: Optional[str] = Field(
        None,
        description="Full text with position markers around matched chunk",
    )
    before_context: Optional[str] = Field(
        None, description="Text before the matched chunk"
    )
    after_context: Optional[str] = Field(
        None, description="Text after the matched chunk"
    )
    has_before_truncation: Optional[bool] = Field(
        None, description="Whether before_context was truncated"
    )
    has_after_truncation: Optional[bool] = Field(
        None, description="Whether after_context was truncated"
    )
|
|
|
|
|
|
class SemanticSearchResponse(BaseResponse):
    """Response model for semantic search across all indexed Nextcloud apps."""

    # Required payload: the hits, the query that produced them, and the count.
    results: List[SemanticSearchResult] = Field(
        ..., description="Semantic search results with similarity scores"
    )
    query: str = Field(..., description="The search query used")
    total_found: int = Field(..., description="Total number of documents found")

    # Falls back to "semantic" when the caller does not set it.
    search_method: str = Field(
        "semantic", description="Search method used (semantic or hybrid)"
    )
|
|
|
|
|
|
class SamplingSearchResponse(BaseResponse):
    """Response from semantic search with LLM-generated answer via MCP sampling.

    This response includes both a generated natural language answer (created by
    the MCP client's LLM via sampling) and the source documents used to generate
    that answer. Users can read the answer for quick information and review
    sources for verification and deeper exploration.

    Attributes:
        query: The original user query
        generated_answer: Natural language answer generated by client's LLM
        sources: List of semantic search results used as context
        total_found: Total number of matching documents found
        search_method: Always "semantic_sampling" for this response type
        model_used: Name of model that generated the answer (e.g., "claude-3-5-sonnet")
        stop_reason: Why generation stopped ("endTurn", "maxTokens", etc.)
    """

    # Required: the question and the generated answer.
    query: str = Field(description="Original user query")
    generated_answer: str = Field(
        description="LLM-generated answer based on retrieved documents"
    )

    # Each instance gets its own empty list when no sources are supplied.
    sources: List[SemanticSearchResult] = Field(
        default_factory=list,
        description="Source documents with excerpts and relevance scores",
    )
    total_found: int = Field(description="Total matching documents")
    search_method: str = Field(
        "semantic_sampling", description="Search method used"
    )

    # Generation metadata; both stay None unless provided.
    model_used: Optional[str] = Field(
        None, description="Model that generated the answer"
    )
    stop_reason: Optional[str] = Field(
        None, description="Reason generation stopped"
    )
|
|
|
|
|
|
class VectorSyncStatusResponse(BaseResponse):
    """Response for vector sync status.

    Provides information about the current state of vector sync,
    including how many documents are indexed and how many are pending.

    Attributes:
        indexed_count: Number of documents in Qdrant vector database
            (defaults to 0)
        pending_count: Number of documents in processing queue
            (defaults to 0)
        status: Current sync status ("idle", "syncing", or "disabled");
            defaults to "disabled"
        enabled: Whether vector sync is enabled (defaults to False)
    """

    indexed_count: int = Field(
        default=0, description="Number of documents indexed in vector database"
    )
    pending_count: int = Field(
        default=0, description="Number of documents pending processing"
    )
    # A response built with no arguments reports "disabled", matching the
    # `enabled=False` default below.
    status: str = Field(
        default="disabled",
        description='Sync status: "idle", "syncing", or "disabled"',
    )
    enabled: bool = Field(default=False, description="Whether vector sync is enabled")
|
|
|
|
|
|
# Names exported by `from <this module> import *`.
__all__ = [
    "SemanticSearchResult",
    "SemanticSearchResponse",
    "SamplingSearchResponse",
    "VectorSyncStatusResponse",
]
|