Files
nextcloud-mcp-server/nextcloud_mcp_server/config.py
T
Chris Coutinho 8f45e996e8 feat: implement vector sync scanner and processor (ADR-007 Phase 2)
Implements background vector database synchronization using anyio
TaskGroups for BasicAuth mode with single-user credentials.

Scanner Implementation:
- Periodic document discovery (hourly, configurable)
- Timestamp-based change detection (Nextcloud vs Qdrant)
- Wake event for immediate scanning on-demand
- Supports both initial sync (all docs) and incremental sync (changes only)
- Detects deleted documents and queues for removal

Processor Implementation:
- Concurrent document processing pool (3 workers default)
- I/O-bound embedding generation via Ollama API
- Retry logic with exponential backoff (3 retries)
- Document chunking (512 words, 50-word overlap)
- Handles both index and delete operations
- Upserts vectors to Qdrant with rich metadata

App Lifespan Integration:
- Extended AppContext with background task state
- Modified app_lifespan_basic() to start tasks via anyio TaskGroups
- Graceful shutdown with coordinated task cancellation
- Only activates when VECTOR_SYNC_ENABLED=true

Embedding Service:
- OllamaEmbeddingProvider with TLS support
- Singleton pattern for shared client instances
- Batch embedding support for efficiency
- Auto-detects embedding dimension (768 for nomic-embed-text)

Qdrant Client:
- Async client wrapper with singleton pattern
- Auto-creates collection on first use
- COSINE distance metric for semantic similarity
- Integrates with embedding service for dimension detection

Health Check Enhancement:
- Added Qdrant status check to /health/ready endpoint
- Only checks when VECTOR_SYNC_ENABLED=true
- 2-second timeout for health probe
- Reports connection errors with details

Configuration:
- VECTOR_SYNC_ENABLED: Enable background sync
- VECTOR_SYNC_SCAN_INTERVAL: Scanner frequency (3600s default)
- VECTOR_SYNC_PROCESSOR_WORKERS: Concurrent processors (3 default)
- QDRANT_URL, QDRANT_API_KEY, QDRANT_COLLECTION: Vector DB config
- OLLAMA_BASE_URL, OLLAMA_EMBEDDING_MODEL: Embedding service config

Dependencies Added:
- qdrant-client>=1.7.0: Vector database client

Docker Compose:
- Added Qdrant service with health check
- Exposed ports 6333 (REST) and 6334 (gRPC)
- Configured MCP service with vector sync environment
- Added qdrant-data volume for persistence

Known Issue:
- FastMCP lifespan not triggering for streamable-http transport
- Background tasks will start once lifespan integration is complete
- Lifespan triggers on MCP session establishment, not server startup

Related: ADR-007 Background Vector Database Synchronization

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-08 21:14:38 +01:00

231 lines
7.9 KiB
Python

import logging.config
import os
from dataclasses import dataclass
from typing import Any, Optional
LOGGING_CONFIG = {
"version": 1,
"disable_existing_loggers": False,
"handlers": {
"default": {
"class": "logging.StreamHandler",
"formatter": "http",
},
},
"formatters": {
"http": {
"format": "%(levelname)s [%(asctime)s] %(name)s - %(message)s",
"datefmt": "%Y-%m-%d %H:%M:%S",
},
},
"loggers": {
"": {
"handlers": ["default"],
"level": "INFO",
},
"httpx": {
"handlers": ["default"],
"level": "INFO",
"propagate": False, # Prevent propagation to root logger
},
"httpcore": {
"handlers": ["default"],
"level": "INFO",
"propagate": False, # Prevent propagation to root logger
},
"uvicorn": {
"handlers": ["default"],
"level": "INFO",
"propagate": False,
},
"uvicorn.access": {
"handlers": ["default"],
"level": "INFO",
"propagate": False,
},
"uvicorn.error": {
"handlers": ["default"],
"level": "INFO",
"propagate": False,
},
},
}
def setup_logging():
logging.config.dictConfig(LOGGING_CONFIG)
# Document Processing Configuration
def get_document_processor_config() -> dict[str, Any]:
"""Get document processor configuration from environment.
Returns:
Dict with processor configs:
{
"enabled": bool,
"default_processor": str,
"processors": {
"unstructured": {...},
"tesseract": {...},
"custom": {...},
}
}
"""
config: dict[str, Any] = {
"enabled": os.getenv("ENABLE_DOCUMENT_PROCESSING", "false").lower() == "true",
"default_processor": os.getenv("DOCUMENT_PROCESSOR", "unstructured"),
"processors": {},
}
# Unstructured configuration
if os.getenv("ENABLE_UNSTRUCTURED", "false").lower() == "true":
config["processors"]["unstructured"] = {
"api_url": os.getenv("UNSTRUCTURED_API_URL", "http://unstructured:8000"),
"timeout": int(os.getenv("UNSTRUCTURED_TIMEOUT", "120")),
"strategy": os.getenv("UNSTRUCTURED_STRATEGY", "auto"),
"languages": [
lang.strip()
for lang in os.getenv("UNSTRUCTURED_LANGUAGES", "eng,deu").split(",")
if lang.strip()
],
"progress_interval": int(os.getenv("PROGRESS_INTERVAL", "10")),
}
# Tesseract configuration
if os.getenv("ENABLE_TESSERACT", "false").lower() == "true":
config["processors"]["tesseract"] = {
"tesseract_cmd": os.getenv("TESSERACT_CMD"), # None = auto-detect
"lang": os.getenv("TESSERACT_LANG", "eng"),
}
# Custom processor (via HTTP API)
if os.getenv("ENABLE_CUSTOM_PROCESSOR", "false").lower() == "true":
custom_url = os.getenv("CUSTOM_PROCESSOR_URL")
if custom_url:
supported_types_str = os.getenv("CUSTOM_PROCESSOR_TYPES", "application/pdf")
supported_types = {
t.strip() for t in supported_types_str.split(",") if t.strip()
}
config["processors"]["custom"] = {
"name": os.getenv("CUSTOM_PROCESSOR_NAME", "custom"),
"api_url": custom_url,
"api_key": os.getenv("CUSTOM_PROCESSOR_API_KEY"),
"timeout": int(os.getenv("CUSTOM_PROCESSOR_TIMEOUT", "60")),
"supported_types": supported_types,
}
return config
@dataclass
class Settings:
"""Application settings from environment variables."""
# OAuth/OIDC settings
oidc_discovery_url: Optional[str] = None
oidc_client_id: Optional[str] = None
oidc_client_secret: Optional[str] = None
oidc_issuer: Optional[str] = None
# Nextcloud settings
nextcloud_host: Optional[str] = None
nextcloud_username: Optional[str] = None
nextcloud_password: Optional[str] = None
# ADR-005: Token Audience Validation (required for OAuth mode)
nextcloud_mcp_server_url: Optional[str] = None # MCP server URL (used as audience)
nextcloud_resource_uri: Optional[str] = None # Nextcloud resource identifier
# Token verification endpoints
jwks_uri: Optional[str] = None
introspection_uri: Optional[str] = None
userinfo_uri: Optional[str] = None
# Progressive Consent settings (always enabled - no flag needed)
enable_token_exchange: bool = False
enable_offline_access: bool = False
# Token exchange cache settings
token_exchange_cache_ttl: int = 300 # seconds (5 minutes default)
# Token settings
token_encryption_key: Optional[str] = None
token_storage_db: Optional[str] = None
# Vector sync settings (ADR-007)
vector_sync_enabled: bool = False
vector_sync_scan_interval: int = 3600 # seconds
vector_sync_processor_workers: int = 3
vector_sync_queue_max_size: int = 10000
# Qdrant settings
qdrant_url: str = "http://qdrant:6333"
qdrant_api_key: Optional[str] = None
qdrant_collection: str = "nextcloud_content"
# Ollama settings (for embeddings)
ollama_base_url: Optional[str] = None
ollama_embedding_model: str = "nomic-embed-text"
ollama_verify_ssl: bool = True
def get_settings() -> Settings:
"""Get application settings from environment variables.
Returns:
Settings object with configuration values
"""
return Settings(
# OAuth/OIDC settings
oidc_discovery_url=os.getenv("OIDC_DISCOVERY_URL"),
oidc_client_id=os.getenv("OIDC_CLIENT_ID"),
oidc_client_secret=os.getenv("OIDC_CLIENT_SECRET"),
oidc_issuer=os.getenv("OIDC_ISSUER"),
# Nextcloud settings
nextcloud_host=os.getenv("NEXTCLOUD_HOST"),
nextcloud_username=os.getenv("NEXTCLOUD_USERNAME"),
nextcloud_password=os.getenv("NEXTCLOUD_PASSWORD"),
# ADR-005: Token Audience Validation
nextcloud_mcp_server_url=os.getenv("NEXTCLOUD_MCP_SERVER_URL"),
nextcloud_resource_uri=os.getenv("NEXTCLOUD_RESOURCE_URI"),
# Token verification endpoints
jwks_uri=os.getenv("JWKS_URI"),
introspection_uri=os.getenv("INTROSPECTION_URI"),
userinfo_uri=os.getenv("USERINFO_URI"),
# Progressive Consent settings (always enabled)
enable_token_exchange=(
os.getenv("ENABLE_TOKEN_EXCHANGE", "false").lower() == "true"
),
enable_offline_access=(
os.getenv("ENABLE_OFFLINE_ACCESS", "false").lower() == "true"
),
# Token exchange cache settings
token_exchange_cache_ttl=int(os.getenv("TOKEN_EXCHANGE_CACHE_TTL", "300")),
# Token settings
token_encryption_key=os.getenv("TOKEN_ENCRYPTION_KEY"),
token_storage_db=os.getenv("TOKEN_STORAGE_DB", "/tmp/tokens.db"),
# Vector sync settings (ADR-007)
vector_sync_enabled=(
os.getenv("VECTOR_SYNC_ENABLED", "false").lower() == "true"
),
vector_sync_scan_interval=int(os.getenv("VECTOR_SYNC_SCAN_INTERVAL", "3600")),
vector_sync_processor_workers=int(
os.getenv("VECTOR_SYNC_PROCESSOR_WORKERS", "3")
),
vector_sync_queue_max_size=int(
os.getenv("VECTOR_SYNC_QUEUE_MAX_SIZE", "10000")
),
# Qdrant settings
qdrant_url=os.getenv("QDRANT_URL", "http://qdrant:6333"),
qdrant_api_key=os.getenv("QDRANT_API_KEY"),
qdrant_collection=os.getenv("QDRANT_COLLECTION", "nextcloud_content"),
# Ollama settings
ollama_base_url=os.getenv("OLLAMA_BASE_URL"),
ollama_embedding_model=os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text"),
ollama_verify_ssl=os.getenv("OLLAMA_VERIFY_SSL", "true").lower() == "true",
)