fdd82f59e2
Completes the ADR-007 implementation by adding user-facing semantic search functionality. Previous phases implemented scanner and processor for background indexing; this adds the query interface. Changes: - Add nc_notes_semantic_search MCP tool for natural language queries - Fix Qdrant point IDs to use UUIDs instead of strings (was causing 400 errors) - Reduce scan interval default from 1 hour to 5 minutes for faster updates - Add SemanticSearchResult and SemanticSearchNotesResponse models - Implement dual-phase authorization (Qdrant filter + Nextcloud API verification) The semantic search enables finding notes by meaning rather than exact keywords, using vector embeddings to understand query intent. Point ID fix resolves critical bug where all document indexing failed with "invalid point ID" errors. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
231 lines
7.9 KiB
Python
231 lines
7.9 KiB
Python
import logging.config
|
|
import os
|
|
from dataclasses import dataclass
|
|
from typing import Any, Optional
|
|
|
|
LOGGING_CONFIG = {
|
|
"version": 1,
|
|
"disable_existing_loggers": False,
|
|
"handlers": {
|
|
"default": {
|
|
"class": "logging.StreamHandler",
|
|
"formatter": "http",
|
|
},
|
|
},
|
|
"formatters": {
|
|
"http": {
|
|
"format": "%(levelname)s [%(asctime)s] %(name)s - %(message)s",
|
|
"datefmt": "%Y-%m-%d %H:%M:%S",
|
|
},
|
|
},
|
|
"loggers": {
|
|
"": {
|
|
"handlers": ["default"],
|
|
"level": "INFO",
|
|
},
|
|
"httpx": {
|
|
"handlers": ["default"],
|
|
"level": "INFO",
|
|
"propagate": False, # Prevent propagation to root logger
|
|
},
|
|
"httpcore": {
|
|
"handlers": ["default"],
|
|
"level": "INFO",
|
|
"propagate": False, # Prevent propagation to root logger
|
|
},
|
|
"uvicorn": {
|
|
"handlers": ["default"],
|
|
"level": "INFO",
|
|
"propagate": False,
|
|
},
|
|
"uvicorn.access": {
|
|
"handlers": ["default"],
|
|
"level": "INFO",
|
|
"propagate": False,
|
|
},
|
|
"uvicorn.error": {
|
|
"handlers": ["default"],
|
|
"level": "INFO",
|
|
"propagate": False,
|
|
},
|
|
},
|
|
}
|
|
|
|
|
|
def setup_logging():
|
|
logging.config.dictConfig(LOGGING_CONFIG)
|
|
|
|
|
|
# Document Processing Configuration
|
|
|
|
|
|
def get_document_processor_config() -> dict[str, Any]:
|
|
"""Get document processor configuration from environment.
|
|
|
|
Returns:
|
|
Dict with processor configs:
|
|
{
|
|
"enabled": bool,
|
|
"default_processor": str,
|
|
"processors": {
|
|
"unstructured": {...},
|
|
"tesseract": {...},
|
|
"custom": {...},
|
|
}
|
|
}
|
|
"""
|
|
config: dict[str, Any] = {
|
|
"enabled": os.getenv("ENABLE_DOCUMENT_PROCESSING", "false").lower() == "true",
|
|
"default_processor": os.getenv("DOCUMENT_PROCESSOR", "unstructured"),
|
|
"processors": {},
|
|
}
|
|
|
|
# Unstructured configuration
|
|
if os.getenv("ENABLE_UNSTRUCTURED", "false").lower() == "true":
|
|
config["processors"]["unstructured"] = {
|
|
"api_url": os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000"),
|
|
"timeout": int(os.getenv("UNSTRUCTURED_TIMEOUT", "120")),
|
|
"strategy": os.getenv("UNSTRUCTURED_STRATEGY", "auto"),
|
|
"languages": [
|
|
lang.strip()
|
|
for lang in os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu").split(",")
|
|
if lang.strip()
|
|
],
|
|
"progress_interval": int(os.getenv("PROGRESS_INTERVAL", "10")),
|
|
}
|
|
|
|
# Tesseract configuration
|
|
if os.getenv("ENABLE_TESSERACT", "false").lower() == "true":
|
|
config["processors"]["tesseract"] = {
|
|
"tesseract_cmd": os.getenv("TESSERACT_CMD"), # None = auto-detect
|
|
"lang": os.getenv("TESSERACT_LANG", "eng"),
|
|
}
|
|
|
|
# Custom processor (via HTTP API)
|
|
if os.getenv("ENABLE_CUSTOM_PROCESSOR", "false").lower() == "true":
|
|
custom_url = os.getenv("CUSTOM_PROCESSOR_URL")
|
|
if custom_url:
|
|
supported_types_str = os.getenv("CUSTOM_PROCESSOR_TYPES", "application/pdf")
|
|
supported_types = {
|
|
t.strip() for t in supported_types_str.split(",") if t.strip()
|
|
}
|
|
|
|
config["processors"]["custom"] = {
|
|
"name": os.getenv("CUSTOM_PROCESSOR_NAME", "custom"),
|
|
"api_url": custom_url,
|
|
"api_key": os.getenv("CUSTOM_PROCESSOR_API_KEY"),
|
|
"timeout": int(os.getenv("CUSTOM_PROCESSOR_TIMEOUT", "60")),
|
|
"supported_types": supported_types,
|
|
}
|
|
|
|
return config
|
|
|
|
|
|
@dataclass
|
|
class Settings:
|
|
"""Application settings from environment variables."""
|
|
|
|
# OAuth/OIDC settings
|
|
oidc_discovery_url: Optional[str] = None
|
|
oidc_client_id: Optional[str] = None
|
|
oidc_client_secret: Optional[str] = None
|
|
oidc_issuer: Optional[str] = None
|
|
|
|
# Nextcloud settings
|
|
nextcloud_host: Optional[str] = None
|
|
nextcloud_username: Optional[str] = None
|
|
nextcloud_password: Optional[str] = None
|
|
|
|
# ADR-005: Token Audience Validation (required for OAuth mode)
|
|
nextcloud_mcp_server_url: Optional[str] = None # MCP server URL (used as audience)
|
|
nextcloud_resource_uri: Optional[str] = None # Nextcloud resource identifier
|
|
|
|
# Token verification endpoints
|
|
jwks_uri: Optional[str] = None
|
|
introspection_uri: Optional[str] = None
|
|
userinfo_uri: Optional[str] = None
|
|
|
|
# Progressive Consent settings (always enabled - no flag needed)
|
|
enable_token_exchange: bool = False
|
|
enable_offline_access: bool = False
|
|
|
|
# Token exchange cache settings
|
|
token_exchange_cache_ttl: int = 300 # seconds (5 minutes default)
|
|
|
|
# Token settings
|
|
token_encryption_key: Optional[str] = None
|
|
token_storage_db: Optional[str] = None
|
|
|
|
# Vector sync settings (ADR-007)
|
|
vector_sync_enabled: bool = False
|
|
vector_sync_scan_interval: int = 300 # seconds (5 minutes)
|
|
vector_sync_processor_workers: int = 3
|
|
vector_sync_queue_max_size: int = 10000
|
|
|
|
# Qdrant settings
|
|
qdrant_url: str = "http://qdrant:6333"
|
|
qdrant_api_key: Optional[str] = None
|
|
qdrant_collection: str = "nextcloud_content"
|
|
|
|
# Ollama settings (for embeddings)
|
|
ollama_base_url: Optional[str] = None
|
|
ollama_embedding_model: str = "nomic-embed-text"
|
|
ollama_verify_ssl: bool = True
|
|
|
|
|
|
def get_settings() -> Settings:
|
|
"""Get application settings from environment variables.
|
|
|
|
Returns:
|
|
Settings object with configuration values
|
|
"""
|
|
return Settings(
|
|
# OAuth/OIDC settings
|
|
oidc_discovery_url=os.getenv("OIDC_DISCOVERY_URL"),
|
|
oidc_client_id=os.getenv("OIDC_CLIENT_ID"),
|
|
oidc_client_secret=os.getenv("OIDC_CLIENT_SECRET"),
|
|
oidc_issuer=os.getenv("OIDC_ISSUER"),
|
|
# Nextcloud settings
|
|
nextcloud_host=os.getenv("NEXTCLOUD_HOST"),
|
|
nextcloud_username=os.getenv("NEXTCLOUD_USERNAME"),
|
|
nextcloud_password=os.getenv("NEXTCLOUD_PASSWORD"),
|
|
# ADR-005: Token Audience Validation
|
|
nextcloud_mcp_server_url=os.getenv("NEXTCLOUD_MCP_SERVER_URL"),
|
|
nextcloud_resource_uri=os.getenv("NEXTCLOUD_RESOURCE_URI"),
|
|
# Token verification endpoints
|
|
jwks_uri=os.getenv("JWKS_URI"),
|
|
introspection_uri=os.getenv("INTROSPECTION_URI"),
|
|
userinfo_uri=os.getenv("USERINFO_URI"),
|
|
# Progressive Consent settings (always enabled)
|
|
enable_token_exchange=(
|
|
os.getenv("ENABLE_TOKEN_EXCHANGE", "false").lower() == "true"
|
|
),
|
|
enable_offline_access=(
|
|
os.getenv("ENABLE_OFFLINE_ACCESS", "false").lower() == "true"
|
|
),
|
|
# Token exchange cache settings
|
|
token_exchange_cache_ttl=int(os.getenv("TOKEN_EXCHANGE_CACHE_TTL", "300")),
|
|
# Token settings
|
|
token_encryption_key=os.getenv("TOKEN_ENCRYPTION_KEY"),
|
|
token_storage_db=os.getenv("TOKEN_STORAGE_DB", "/tmp/tokens.db"),
|
|
# Vector sync settings (ADR-007)
|
|
vector_sync_enabled=(
|
|
os.getenv("VECTOR_SYNC_ENABLED", "false").lower() == "true"
|
|
),
|
|
vector_sync_scan_interval=int(os.getenv("VECTOR_SYNC_SCAN_INTERVAL", "300")),
|
|
vector_sync_processor_workers=int(
|
|
os.getenv("VECTOR_SYNC_PROCESSOR_WORKERS", "3")
|
|
),
|
|
vector_sync_queue_max_size=int(
|
|
os.getenv("VECTOR_SYNC_QUEUE_MAX_SIZE", "10000")
|
|
),
|
|
# Qdrant settings
|
|
qdrant_url=os.getenv("QDRANT_URL", "http://qdrant:6333"),
|
|
qdrant_api_key=os.getenv("QDRANT_API_KEY"),
|
|
qdrant_collection=os.getenv("QDRANT_COLLECTION", "nextcloud_content"),
|
|
# Ollama settings
|
|
ollama_base_url=os.getenv("OLLAMA_BASE_URL"),
|
|
ollama_embedding_model=os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text"),
|
|
ollama_verify_ssl=os.getenv("OLLAMA_VERIFY_SSL", "true").lower() == "true",
|
|
)
|