3464b21845
Fix false-positive validation error where DBSF (Distribution-Based Score Fusion) correctly produces scores > 1.0 but SearchResult validation incorrectly rejected them. **Root Cause**: SearchResult.__post_init__() enforced scores in [0.0, 1.0] range, but DBSF sums normalized scores from multiple retrieval systems (dense semantic + sparse BM25), resulting in scores like 1.55 when both systems strongly agree a document is relevant. **Changes**: - Relaxed validation to allow any score ≥ 0.0 (algorithms.py:147-157) - Updated SearchResult and SemanticSearchResult documentation to explain score ranges for RRF ([0.0, 1.0]) vs DBSF (unbounded) - Added comprehensive test coverage for both fusion methods - Added DBSF fusion option to vector visualization UI - Updated viz routes and vizApp() to support fusion parameter selection **Testing**: All 157 unit tests pass, type checking passes, ruff passes Fixes error: "Configuration error: Score must be between 0.0 and 1.0, got 1.1528953" 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
123 lines
4.5 KiB
Python
123 lines
4.5 KiB
Python
"""Pydantic models for semantic search responses."""
|
|
|
|
from typing import List, Optional
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from .base import BaseResponse
|
|
|
|
|
|
class SemanticSearchResult(BaseModel):
    """One semantic search result, including chunk-level metadata.

    A result identifies the document (``id``, ``doc_type``, ``title``,
    ``category``), carries the matching ``excerpt`` together with its
    relevance ``score``, and records which chunk matched
    (``chunk_index`` out of ``total_chunks``, plus optional character
    offsets of that chunk).

    No numeric bounds are declared on ``score`` in this model; the
    expected range is documented in the field description only.
    """

    # Document identity. ``category`` is the only field in this group
    # with a default (empty string); the others are required.
    id: int = Field(description="Document ID")
    doc_type: str = Field(
        description="Document type (note, calendar_event, deck_card, etc.)",
    )
    title: str = Field(description="Document title")
    category: str = Field(
        description="Document category (notes) or location (calendar)",
        default="",
    )

    # Matched content and its relevance.
    excerpt: str = Field(description="Excerpt from matching chunk")
    score: float = Field(
        description=(
            "Relevance score (≥ 0.0, higher is better). Score range depends "
            "on fusion method: RRF produces scores in [0.0, 1.0], DBSF can "
            "exceed 1.0 (sum of normalized scores from multiple systems)"
        ),
    )

    # Chunk position; the character offsets are optional and default to None.
    chunk_index: int = Field(description="Index of matching chunk in document")
    total_chunks: int = Field(description="Total number of chunks in document")
    chunk_start_offset: Optional[int] = Field(
        description="Character position where chunk starts in document",
        default=None,
    )
    chunk_end_offset: Optional[int] = Field(
        description="Character position where chunk ends in document",
        default=None,
    )
|
|
|
|
|
|
class SemanticSearchResponse(BaseResponse):
    """Response model for semantic search across all indexed Nextcloud apps.

    Bundles the ranked ``results`` with the ``query`` that produced them,
    the ``total_found`` count, and the ``search_method`` label, which
    defaults to ``"semantic"``.
    """

    results: List[SemanticSearchResult] = Field(
        description="Semantic search results with similarity scores",
    )
    query: str = Field(description="The search query used")
    total_found: int = Field(description="Total number of documents found")
    search_method: str = Field(
        description="Search method used (semantic or hybrid)",
        default="semantic",
    )
|
|
|
|
|
|
class SamplingSearchResponse(BaseResponse):
    """Semantic search response paired with an LLM-generated answer.

    The answer is produced by the MCP client's LLM via sampling. The
    response carries both that natural language answer and the source
    documents used as context, so a reader can take the answer for quick
    information and check the sources for verification or further detail.

    Attributes:
        query: The original user query (required).
        generated_answer: Natural language answer generated by the
            client's LLM (required).
        sources: Semantic search results used as context; defaults to an
            empty list.
        total_found: Total number of matching documents found (required).
        search_method: Search method label; defaults to
            "semantic_sampling".
        model_used: Name of the model that generated the answer
            (e.g., "claude-3-5-sonnet"), or None.
        stop_reason: Why generation stopped ("endTurn", "maxTokens",
            etc.), or None.
    """

    # Required fields: no default is supplied, so callers must provide them.
    query: str = Field(description="Original user query")
    generated_answer: str = Field(
        description="LLM-generated answer based on retrieved documents",
    )

    # default_factory gives each instance its own fresh list.
    sources: List[SemanticSearchResult] = Field(
        description="Source documents with excerpts and relevance scores",
        default_factory=list,
    )
    total_found: int = Field(description="Total matching documents")
    search_method: str = Field(
        description="Search method used",
        default="semantic_sampling",
    )

    # Optional generation metadata.
    model_used: Optional[str] = Field(
        description="Model that generated the answer",
        default=None,
    )
    stop_reason: Optional[str] = Field(
        description="Reason generation stopped",
        default=None,
    )
|
|
|
|
|
|
class VectorSyncStatusResponse(BaseResponse):
    """Response for vector sync status.

    Provides information about the current state of vector sync,
    including how many documents are indexed and how many are pending.
    Every field has a default, and together the defaults describe a
    disabled sync: zero counts, status "disabled", ``enabled=False``.

    Attributes:
        indexed_count: Number of documents in Qdrant vector database
            (default 0).
        pending_count: Number of documents in processing queue
            (default 0).
        status: Current sync status: "idle", "syncing", or "disabled"
            (default "disabled"). The value is a plain string and is not
            restricted to these three by this model.
        enabled: Whether vector sync is enabled (default False).
    """

    indexed_count: int = Field(
        default=0, description="Number of documents indexed in vector database"
    )
    pending_count: int = Field(
        default=0, description="Number of documents pending processing"
    )
    status: str = Field(
        default="disabled",
        description='Sync status: "idle", "syncing", or "disabled"',
    )
    enabled: bool = Field(default=False, description="Whether vector sync is enabled")
|
|
|
|
|
|
# Public API of this module: the result model followed by the response models.
__all__ = [
    "SemanticSearchResult",
    "SemanticSearchResponse",
    "SamplingSearchResponse",
    "VectorSyncStatusResponse",
]
|