a11ae9c027
Enable ruff PLC0415 rule for all source files (tests excluded via per-file-ignores). Move 136 inline imports to top-level across 33 files. 8 imports suppressed with noqa for legitimate reasons: circular dependencies (client/__init__.py, context.py), optional dependency guards (app.py document processors, auth/userinfo_routes.py), and post-env-setup imports (smithery_main.py). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
467 lines
14 KiB
Python
467 lines
14 KiB
Python
"""
|
|
Prometheus metrics for the Nextcloud MCP Server.
|
|
|
|
This module defines all Prometheus metrics for monitoring server health, performance,
|
|
and resource usage. Metrics are organized by category:
|
|
|
|
- HTTP Server Metrics (RED: Rate, Errors, Duration)
|
|
- MCP Tool Metrics (per-tool invocation tracking)
|
|
- MCP Resource Metrics
|
|
- Nextcloud API Client Metrics
|
|
- OAuth Flow Metrics
|
|
- Vector Sync Metrics (conditional on feature flag)
|
|
- Database Operation Metrics
|
|
- External Dependency Health Metrics
|
|
"""
|
|
|
|
import errno
import functools
import logging
import time

from prometheus_client import (
    Counter,
    Gauge,
    Histogram,
    start_http_server,
)

from nextcloud_mcp_server.observability.tracing import trace_operation
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)


# =============================================================================
# HTTP Server Metrics (RED + System)
# =============================================================================

# Count of HTTP requests, split by method, endpoint and response status code.
http_requests_total = Counter(
    "mcp_http_requests_total",
    "Total HTTP requests received",
    labelnames=("method", "endpoint", "status_code"),
)

# Latency distribution of HTTP requests, split by method and endpoint.
http_request_duration_seconds = Histogram(
    "mcp_http_request_duration_seconds",
    "HTTP request latency in seconds",
    labelnames=("method", "endpoint"),
    buckets=(0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
)

# Number of HTTP requests in flight, split by method and endpoint.
http_requests_in_progress = Gauge(
    "mcp_http_requests_in_progress",
    "Number of HTTP requests currently being processed",
    labelnames=("method", "endpoint"),
)
|
|
|
|
# =============================================================================
|
|
# MCP Tool Metrics
|
|
# =============================================================================
|
|
|
|
# Count of tool invocations per tool.
# Label "status" takes the values: success | error
mcp_tool_calls_total = Counter(
    "mcp_tool_calls_total",
    "Total MCP tool invocations",
    labelnames=("tool_name", "status"),
)

# Execution time distribution per tool.
mcp_tool_duration_seconds = Histogram(
    "mcp_tool_duration_seconds",
    "MCP tool execution duration in seconds",
    labelnames=("tool_name",),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0),
)

# Count of tool errors per tool, broken down by error type.
mcp_tool_errors_total = Counter(
    "mcp_tool_errors_total",
    "Total MCP tool errors by type",
    labelnames=("tool_name", "error_type"),
)
|
|
|
|
# =============================================================================
|
|
# MCP Resource Metrics
|
|
# =============================================================================
|
|
|
|
# Count of resource requests, split by resource URI and status.
mcp_resource_requests_total = Counter(
    "mcp_resource_requests_total",
    "Total MCP resource requests",
    labelnames=("resource_uri", "status"),
)

# Duration distribution of resource requests per resource URI.
mcp_resource_duration_seconds = Histogram(
    "mcp_resource_duration_seconds",
    "MCP resource request duration in seconds",
    labelnames=("resource_uri",),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
)
|
|
|
|
# =============================================================================
|
|
# Nextcloud API Client Metrics
|
|
# =============================================================================
|
|
|
|
# Count of Nextcloud API requests.
# Label "app" takes values such as: notes, calendar, contacts, etc.
nextcloud_api_requests_total = Counter(
    "mcp_nextcloud_api_requests_total",
    "Total Nextcloud API requests",
    labelnames=("app", "method", "status_code"),
)

# Duration distribution of Nextcloud API requests per app and HTTP method.
nextcloud_api_duration_seconds = Histogram(
    "mcp_nextcloud_api_duration_seconds",
    "Nextcloud API request duration in seconds",
    labelnames=("app", "method"),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
)

# Count of Nextcloud API retries.
# Label "reason" takes the values: 429 | timeout | connection_error
nextcloud_api_retries_total = Counter(
    "mcp_nextcloud_api_retries_total",
    "Total Nextcloud API retries",
    labelnames=("app", "reason"),
)
|
|
|
|
# =============================================================================
|
|
# OAuth Flow Metrics
|
|
# =============================================================================
|
|
|
|
# Count of token validation attempts.
# Label "method": introspect | jwt
# Label "result": valid | invalid | error
oauth_token_validations_total = Counter(
    "mcp_oauth_token_validations_total",
    "Total OAuth token validation attempts",
    labelnames=("method", "result"),
)

# Count of RFC 8693 token exchange operations.
# Label "status": success | error
oauth_token_exchange_total = Counter(
    "mcp_oauth_token_exchange_total",
    "Total OAuth token exchange operations (RFC 8693)",
    labelnames=("status",),
)

# Count of token cache lookups.
# Label "hit": true | false
oauth_token_cache_hits_total = Counter(
    "mcp_oauth_token_cache_hits_total",
    "Total OAuth token cache lookups",
    labelnames=("hit",),
)

# Count of refresh token storage operations.
# Label "operation": store | retrieve | delete
# Label "status": success | error
oauth_refresh_token_operations_total = Counter(
    "mcp_oauth_refresh_token_operations_total",
    "Total refresh token storage operations",
    labelnames=("operation", "status"),
)
|
|
|
|
# =============================================================================
|
|
# Vector Sync Metrics (optional feature)
|
|
# =============================================================================
|
|
|
|
# Count of documents scanned for vector sync (no labels).
vector_sync_documents_scanned_total = Counter(
    "mcp_vector_sync_documents_scanned_total",
    "Total documents scanned for vector sync",
)

# Count of documents processed for vector sync.
# Label "status": success | error
vector_sync_documents_processed_total = Counter(
    "mcp_vector_sync_documents_processed_total",
    "Total documents processed for vector sync",
    labelnames=("status",),
)

# Duration distribution of document processing (no labels).
vector_sync_processing_duration_seconds = Histogram(
    "mcp_vector_sync_processing_duration_seconds",
    "Document processing duration in seconds",
    buckets=(0.1, 0.5, 1.0, 2.5, 5.0, 10.0, 30.0, 60.0),
)

# Current number of documents waiting in the processing queue (no labels).
vector_sync_queue_size = Gauge(
    "mcp_vector_sync_queue_size",
    "Current number of documents in processing queue",
)

# Count of Qdrant vector database operations.
# Label "operation": upsert | search | delete
# Label "status": success | error
qdrant_operations_total = Counter(
    "mcp_qdrant_operations_total",
    "Total Qdrant vector database operations",
    labelnames=("operation", "status"),
)
|
|
|
|
# =============================================================================
|
|
# Database Metrics
|
|
# =============================================================================
|
|
|
|
# Count of database operations.
# Label "db": sqlite | qdrant; the "operation" label varies by caller.
db_operations_total = Counter(
    "mcp_db_operations_total",
    "Total database operations",
    labelnames=("db", "operation", "status"),
)

# Duration distribution of database operations per database and operation.
db_operation_duration_seconds = Histogram(
    "mcp_db_operation_duration_seconds",
    "Database operation duration in seconds",
    labelnames=("db", "operation"),
    buckets=(0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0),
)
|
|
|
|
# =============================================================================
|
|
# External Dependency Health Metrics
|
|
# =============================================================================
|
|
|
|
# Up/down status of each external dependency (1=up, 0=down).
# Label "dependency": nextcloud | keycloak | qdrant | unstructured
dependency_health = Gauge(
    "mcp_dependency_health",
    "External dependency health status (1=up, 0=down)",
    labelnames=("dependency",),
)

# Duration distribution of dependency health checks per dependency.
dependency_check_duration_seconds = Histogram(
    "mcp_dependency_check_duration_seconds",
    "Dependency health check duration in seconds",
    labelnames=("dependency",),
    buckets=(0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5),
)
|
|
|
|
# =============================================================================
|
|
# Metrics Setup and HTTP Handler
|
|
# =============================================================================
|
|
|
|
|
|
def setup_metrics(port: int = 9090) -> None:
    """
    Initialize Prometheus metrics collection and start HTTP server.

    Starts a dedicated HTTP server on the specified port to serve metrics.
    This server runs in a separate thread and is isolated from the main application.

    If the port is already bound, a warning is logged and the function returns
    normally. Any other OSError is logged and re-raised.

    Args:
        port: Port to serve metrics on (default: 9090)

    Raises:
        OSError: If the server cannot be started for a reason other than the
            port already being in use.

    Note:
        Metrics endpoint (/metrics) is ONLY accessible on this dedicated port,
        not on the main application HTTP port. This is a security best practice
        to prevent external exposure of metrics.
    """
    try:
        start_http_server(port)
    except OSError as e:
        # Prefer the errno code: the human-readable message differs between
        # platforms and locales. The text match is kept as a fallback for
        # errors that carry no errno.
        port_in_use = (
            e.errno == errno.EADDRINUSE or "Address already in use" in str(e)
        )
        if not port_in_use:
            logger.error(f"Failed to start metrics server on port {port}: {e}")
            raise
        logger.warning(
            f"Metrics port {port} already in use (metrics server likely already running)"
        )
    else:
        logger.info(f"Prometheus metrics server started on port {port}")
|
|
|
|
|
|
# =============================================================================
|
|
# Convenience Functions for Common Metric Updates
|
|
# =============================================================================
|
|
|
|
|
|
def record_tool_call(tool_name: str, duration: float, status: str = "success") -> None:
    """
    Count one MCP tool invocation and observe how long it took.

    Args:
        tool_name: Name of the MCP tool
        duration: Execution duration in seconds
        status: "success" or "error"
    """
    call_counter = mcp_tool_calls_total.labels(tool_name=tool_name, status=status)
    call_counter.inc()

    duration_histogram = mcp_tool_duration_seconds.labels(tool_name=tool_name)
    duration_histogram.observe(duration)
|
|
|
|
|
|
def record_tool_error(tool_name: str, error_type: str) -> None:
    """
    Count one MCP tool error under the given error type.

    Args:
        tool_name: Name of the MCP tool
        error_type: Type of error (e.g., "HTTPStatusError", "ValueError")
    """
    error_counter = mcp_tool_errors_total.labels(
        tool_name=tool_name,
        error_type=error_type,
    )
    error_counter.inc()
|
|
|
|
|
|
def record_nextcloud_api_call(
    app: str,
    method: str,
    status_code: int,
    duration: float,
) -> None:
    """
    Count one Nextcloud API request and observe its duration.

    Args:
        app: Nextcloud app name (notes, calendar, contacts, etc.)
        method: HTTP method (GET, POST, PUT, DELETE, PROPFIND, etc.)
        status_code: HTTP status code; converted to a string for the label
        duration: Request duration in seconds
    """
    status_label = str(status_code)
    request_counter = nextcloud_api_requests_total.labels(
        app=app, method=method, status_code=status_label
    )
    request_counter.inc()

    duration_histogram = nextcloud_api_duration_seconds.labels(app=app, method=method)
    duration_histogram.observe(duration)
|
|
|
|
|
|
def record_nextcloud_api_retry(app: str, reason: str) -> None:
    """
    Count one Nextcloud API retry.

    Args:
        app: Nextcloud app name
        reason: Retry reason (429, timeout, connection_error)
    """
    retry_counter = nextcloud_api_retries_total.labels(app=app, reason=reason)
    retry_counter.inc()
|
|
|
|
|
|
def record_oauth_token_validation(method: str, result: str) -> None:
    """
    Count one OAuth token validation attempt.

    Args:
        method: Validation method ("introspect" or "jwt")
        result: Validation result ("valid", "invalid", or "error")
    """
    validation_counter = oauth_token_validations_total.labels(
        method=method,
        result=result,
    )
    validation_counter.inc()
|
|
|
|
|
|
def record_db_operation(
    db: str, operation: str, duration: float, status: str = "success"
) -> None:
    """
    Count one database operation and observe its duration.

    Args:
        db: Database type ("sqlite" or "qdrant")
        operation: Operation type (e.g., "insert", "select", "upsert", "search")
        duration: Operation duration in seconds
        status: "success" or "error"
    """
    operation_counter = db_operations_total.labels(
        db=db, operation=operation, status=status
    )
    operation_counter.inc()

    duration_histogram = db_operation_duration_seconds.labels(
        db=db, operation=operation
    )
    duration_histogram.observe(duration)
|
|
|
|
|
|
def set_dependency_health(dependency: str, is_healthy: bool) -> None:
    """
    Set the health gauge for an external dependency to 1 (up) or 0 (down).

    Args:
        dependency: Dependency name (nextcloud, keycloak, qdrant, unstructured)
        is_healthy: True if dependency is healthy, False otherwise
    """
    if is_healthy:
        gauge_value = 1
    else:
        gauge_value = 0

    health_gauge = dependency_health.labels(dependency=dependency)
    health_gauge.set(gauge_value)
|
|
|
|
|
|
def record_dependency_check(dependency: str, duration: float) -> None:
    """
    Observe how long a dependency health check took.

    Args:
        dependency: Dependency name
        duration: Check duration in seconds
    """
    check_histogram = dependency_check_duration_seconds.labels(dependency=dependency)
    check_histogram.observe(duration)
|
|
|
|
|
|
def record_vector_sync_scan(documents_found: int) -> None:
    """
    Add the number of documents found by a scan to the scanned-documents counter.

    Args:
        documents_found: Number of documents discovered in scan
    """
    scanned_counter = vector_sync_documents_scanned_total
    scanned_counter.inc(documents_found)
|
|
|
|
|
|
def record_vector_sync_processing(duration: float, status: str = "success") -> None:
    """
    Count one processed document and observe its processing duration.

    Args:
        duration: Processing duration in seconds
        status: "success" or "error"
    """
    processed_counter = vector_sync_documents_processed_total.labels(status=status)
    processed_counter.inc()

    duration_histogram = vector_sync_processing_duration_seconds
    duration_histogram.observe(duration)
|
|
|
|
|
|
def record_qdrant_operation(operation: str, status: str = "success") -> None:
    """
    Count one Qdrant vector database operation.

    Args:
        operation: Operation type ("upsert", "search", "delete")
        status: "success" or "error"
    """
    operation_counter = qdrant_operations_total.labels(
        operation=operation,
        status=status,
    )
    operation_counter.inc()
|
|
|
|
|
|
def update_vector_sync_queue_size(size: int) -> None:
    """
    Set the vector sync queue size gauge to the given value.

    Args:
        size: Current queue size
    """
    queue_gauge = vector_sync_queue_size
    queue_gauge.set(size)
|
|
|
|
|
|
# =============================================================================
|
|
# Decorator for Automatic Tool Instrumentation
|
|
# =============================================================================
|
|
|
|
|
|
def instrument_tool(func):
    """
    Decorator to automatically instrument MCP tool functions with metrics and tracing.

    Wraps async tool functions to record execution time, success/error status, and
    create OpenTelemetry trace spans. Compatible with @mcp.tool() and @require_scopes()
    decorators.

    The tool name used for metrics and the span is ``func.__name__``. Only keyword
    arguments are attached to the span; positional arguments are not. Keyword
    arguments named "password", "token", "secret", "api_key", "etag" or "ctx"
    are left out, and the stringified arguments are cut to 500 characters.

    Exceptions raised by the tool are recorded as an error and re-raised unchanged.

    Usage:
        @mcp.tool()
        @require_scopes("notes:write")
        @instrument_tool
        async def nc_notes_create_note(...):
            ...

    Args:
        func: The async function to instrument

    Returns:
        Wrapped function with metrics and tracing instrumentation
    """

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):
        tool_name = func.__name__
        # perf_counter() is monotonic, so the measured duration cannot go
        # negative or jump when the system wall clock is adjusted.
        start_time = time.perf_counter()

        # Extract tool arguments for tracing (sanitize sensitive fields)
        # kwargs contains the actual arguments passed to the tool
        tool_args = {
            k: v
            for k, v in kwargs.items()
            if k not in ("password", "token", "secret", "api_key", "etag", "ctx")
        }

        # Create trace span with metrics collection
        with trace_operation(
            f"mcp.tool.{tool_name}",
            attributes={
                "mcp.tool.name": tool_name,
                "mcp.tool.args": str(tool_args)[:500]
                if tool_args
                else None,  # Limit to 500 chars
            },
            record_exception=True,
        ):
            try:
                result = await func(*args, **kwargs)
            except Exception as e:
                duration = time.perf_counter() - start_time
                record_tool_call(tool_name, duration, "error")
                record_tool_error(tool_name, type(e).__name__)
                raise
            else:
                # Recorded outside the try body so a failure while recording
                # the success metric is not also counted as a tool error.
                duration = time.perf_counter() - start_time
                record_tool_call(tool_name, duration, "success")
                return result

    return wrapper
|