Files
nextcloud-mcp-server/nextcloud_mcp_server/observability/metrics.py
T
Chris Coutinho 578de4d7d6 feat(observability): Add comprehensive monitoring with Prometheus and OpenTelemetry
- Add Prometheus metrics for HTTP, MCP tools, Nextcloud API, OAuth, vector sync, and DB operations
- Add OpenTelemetry distributed tracing with OTLP export
- Add structured JSON logging with trace context correlation
- Add ObservabilityMiddleware for automatic HTTP instrumentation
- Add app_name attribute to all client classes for per-app metrics
- Add configuration for metrics, tracing, and logging via environment variables
- Add documentation in docs/observability.md
- Fix graceful degradation when tracing is disabled (default state)
- Fix uvicorn logging configuration to use observability formatters

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-09 08:54:04 +01:00

356 lines
11 KiB
Python

"""
Prometheus metrics for the Nextcloud MCP Server.
This module defines all Prometheus metrics for monitoring server health, performance,
and resource usage. Metrics are organized by category:
- HTTP Server Metrics (RED: Rate, Errors, Duration)
- MCP Tool Metrics (per-tool invocation tracking)
- MCP Resource Metrics
- Nextcloud API Client Metrics
- OAuth Flow Metrics
- Vector Sync Metrics (conditional on feature flag)
- Database Operation Metrics
- External Dependency Health Metrics
"""
import logging
from prometheus_client import (
CONTENT_TYPE_LATEST,
REGISTRY,
Counter,
Gauge,
Histogram,
generate_latest,
)
from starlette.requests import Request
from starlette.responses import Response
logger = logging.getLogger(__name__)
# =============================================================================
# HTTP Server Metrics (RED + System)
# =============================================================================
http_requests_total = Counter(
"mcp_http_requests_total",
"Total HTTP requests received",
["method", "endpoint", "status_code"],
)
http_request_duration_seconds = Histogram(
"mcp_http_request_duration_seconds",
"HTTP request latency in seconds",
["method", "endpoint"],
buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
)
http_requests_in_progress = Gauge(
"mcp_http_requests_in_progress",
"Number of HTTP requests currently being processed",
["method", "endpoint"],
)
# =============================================================================
# MCP Tool Metrics
# =============================================================================
mcp_tool_calls_total = Counter(
"mcp_tool_calls_total",
"Total MCP tool invocations",
["tool_name", "status"], # status: success | error
)
mcp_tool_duration_seconds = Histogram(
"mcp_tool_duration_seconds",
"MCP tool execution duration in seconds",
["tool_name"],
buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0),
)
mcp_tool_errors_total = Counter(
"mcp_tool_errors_total",
"Total MCP tool errors by type",
["tool_name", "error_type"],
)
# =============================================================================
# MCP Resource Metrics
# =============================================================================
mcp_resource_requests_total = Counter(
"mcp_resource_requests_total",
"Total MCP resource requests",
["resource_uri", "status"],
)
mcp_resource_duration_seconds = Histogram(
"mcp_resource_duration_seconds",
"MCP resource request duration in seconds",
["resource_uri"],
buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
)
# =============================================================================
# Nextcloud API Client Metrics
# =============================================================================
nextcloud_api_requests_total = Counter(
"mcp_nextcloud_api_requests_total",
"Total Nextcloud API requests",
["app", "method", "status_code"], # app: notes, calendar, contacts, etc.
)
nextcloud_api_duration_seconds = Histogram(
"mcp_nextcloud_api_duration_seconds",
"Nextcloud API request duration in seconds",
["app", "method"],
buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
)
nextcloud_api_retries_total = Counter(
"mcp_nextcloud_api_retries_total",
"Total Nextcloud API retries",
["app", "reason"], # reason: 429 | timeout | connection_error
)
# =============================================================================
# OAuth Flow Metrics
# =============================================================================
oauth_token_validations_total = Counter(
"mcp_oauth_token_validations_total",
"Total OAuth token validation attempts",
["method", "result"], # method: introspect | jwt; result: valid | invalid | error
)
oauth_token_exchange_total = Counter(
"mcp_oauth_token_exchange_total",
"Total OAuth token exchange operations (RFC 8693)",
["status"], # status: success | error
)
oauth_token_cache_hits_total = Counter(
"mcp_oauth_token_cache_hits_total",
"Total OAuth token cache lookups",
["hit"], # hit: true | false
)
oauth_refresh_token_operations_total = Counter(
"mcp_oauth_refresh_token_operations_total",
"Total refresh token storage operations",
[
"operation",
"status",
], # operation: store | retrieve | delete; status: success | error
)
# =============================================================================
# Vector Sync Metrics (optional feature)
# =============================================================================
vector_sync_documents_scanned_total = Counter(
"mcp_vector_sync_documents_scanned_total",
"Total documents scanned for vector sync",
)
vector_sync_documents_processed_total = Counter(
"mcp_vector_sync_documents_processed_total",
"Total documents processed for vector sync",
["status"], # status: success | error
)
vector_sync_processing_duration_seconds = Histogram(
"mcp_vector_sync_processing_duration_seconds",
"Document processing duration in seconds",
buckets=(0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
)
vector_sync_queue_size = Gauge(
"mcp_vector_sync_queue_size",
"Current number of documents in processing queue",
)
qdrant_operations_total = Counter(
"mcp_qdrant_operations_total",
"Total Qdrant vector database operations",
[
"operation",
"status",
], # operation: upsert | search | delete; status: success | error
)
# =============================================================================
# Database Metrics
# =============================================================================
db_operations_total = Counter(
"mcp_db_operations_total",
"Total database operations",
["db", "operation", "status"], # db: sqlite | qdrant; operation varies
)
db_operation_duration_seconds = Histogram(
"mcp_db_operation_duration_seconds",
"Database operation duration in seconds",
["db", "operation"],
buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
)
# =============================================================================
# External Dependency Health Metrics
# =============================================================================
dependency_health = Gauge(
"mcp_dependency_health",
"External dependency health status (1=up, 0=down)",
["dependency"], # dependency: nextcloud | keycloak | qdrant | unstructured
)
dependency_check_duration_seconds = Histogram(
"mcp_dependency_check_duration_seconds",
"Dependency health check duration in seconds",
["dependency"],
buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
)
# =============================================================================
# Metrics Setup and HTTP Handler
# =============================================================================
def setup_metrics() -> None:
"""
Initialize Prometheus metrics collection.
This function should be called once during application startup.
It currently doesn't require any initialization beyond module-level
metric definitions, but is provided for consistency and future extensibility.
"""
logger.info("Prometheus metrics initialized")
async def get_metrics_handler(request: Request) -> Response:
"""
HTTP handler for the /metrics endpoint.
Args:
request: Starlette request object (unused, but required by signature)
Returns:
Response containing Prometheus metrics in text format
"""
metrics_data = generate_latest(REGISTRY)
return Response(content=metrics_data, media_type=CONTENT_TYPE_LATEST)
# =============================================================================
# Convenience Functions for Common Metric Updates
# =============================================================================
def record_tool_call(tool_name: str, duration: float, status: str = "success") -> None:
"""
Record metrics for an MCP tool call.
Args:
tool_name: Name of the MCP tool
duration: Execution duration in seconds
status: "success" or "error"
"""
mcp_tool_calls_total.labels(tool_name=tool_name, status=status).inc()
mcp_tool_duration_seconds.labels(tool_name=tool_name).observe(duration)
def record_tool_error(tool_name: str, error_type: str) -> None:
"""
Record an MCP tool error.
Args:
tool_name: Name of the MCP tool
error_type: Type of error (e.g., "HTTPStatusError", "ValueError")
"""
mcp_tool_errors_total.labels(tool_name=tool_name, error_type=error_type).inc()
def record_nextcloud_api_call(
app: str,
method: str,
status_code: int,
duration: float,
) -> None:
"""
Record metrics for a Nextcloud API call.
Args:
app: Nextcloud app name (notes, calendar, contacts, etc.)
method: HTTP method (GET, POST, PUT, DELETE, PROPFIND, etc.)
status_code: HTTP status code
duration: Request duration in seconds
"""
nextcloud_api_requests_total.labels(
app=app, method=method, status_code=str(status_code)
).inc()
nextcloud_api_duration_seconds.labels(app=app, method=method).observe(duration)
def record_nextcloud_api_retry(app: str, reason: str) -> None:
"""
Record a Nextcloud API retry.
Args:
app: Nextcloud app name
reason: Retry reason (429, timeout, connection_error)
"""
nextcloud_api_retries_total.labels(app=app, reason=reason).inc()
def record_oauth_token_validation(method: str, result: str) -> None:
"""
Record an OAuth token validation.
Args:
method: Validation method ("introspect" or "jwt")
result: Validation result ("valid", "invalid", or "error")
"""
oauth_token_validations_total.labels(method=method, result=result).inc()
def record_db_operation(
db: str, operation: str, duration: float, status: str = "success"
) -> None:
"""
Record a database operation.
Args:
db: Database type ("sqlite" or "qdrant")
operation: Operation type (e.g., "insert", "select", "upsert", "search")
duration: Operation duration in seconds
status: "success" or "error"
"""
db_operations_total.labels(db=db, operation=operation, status=status).inc()
db_operation_duration_seconds.labels(db=db, operation=operation).observe(duration)
def set_dependency_health(dependency: str, is_healthy: bool) -> None:
"""
Update external dependency health status.
Args:
dependency: Dependency name (nextcloud, keycloak, qdrant, unstructured)
is_healthy: True if dependency is healthy, False otherwise
"""
dependency_health.labels(dependency=dependency).set(1 if is_healthy else 0)
def record_dependency_check(dependency: str, duration: float) -> None:
"""
Record a dependency health check duration.
Args:
dependency: Dependency name
duration: Check duration in seconds
"""
dependency_check_duration_seconds.labels(dependency=dependency).observe(duration)