From 4ea5ed72d44f1de8b2352f947a7c7b6db7556be9 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Thu, 13 Nov 2025 11:49:20 +0100 Subject: [PATCH] feat: Add Grafana dashboard and vector sync metric instrumentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement comprehensive observability for vector database synchronization with Grafana dashboard and Prometheus metrics. ## Part 1: Grafana Dashboard Created all-in-one operations dashboard with 7 rows and 34 panels: ### Dashboard Structure: - **Overview Row**: Request rate, error rate, P95 latency, active requests - **HTTP Metrics (RED)**: Request/error rates by endpoint, latency percentiles - **MCP Tools**: Call volume, error rates, execution duration by tool - **Nextcloud API**: API calls/latency by app, retry patterns - **OAuth & Authentication**: Token validations, exchanges, cache hit rate - **Dependencies & Health**: Status for Nextcloud/Qdrant/Keycloak/Unstructured - **Vector Sync**: Processing throughput, queue depth, Qdrant operations ### Helm Chart Integration: - Added dashboard-configmap.yaml template for automatic provisioning - Configured Grafana sidecar auto-discovery (label: grafana_dashboard="1") - Added dashboards configuration section in values.yaml (opt-in) - Updated Chart.yaml with dashboard annotations - Enhanced NOTES.txt with dashboard deployment instructions - Comprehensive documentation in dashboards/README.md Dashboard supports dynamic filtering via variables: - datasource: Prometheus data source selection - namespace: Filter by Kubernetes namespace - pod: Multi-select pod filtering - interval: Query interval (1m/5m/10m/30m/1h) ## Part 2: Vector Sync Metric Instrumentation Implemented metric recording throughout vector sync pipeline: ### metrics.py: Added convenience functions: - record_vector_sync_scan() - Track documents per scan - record_vector_sync_processing() - Track processing duration/status - record_qdrant_operation() - Track 
database operations - update_vector_sync_queue_size() - Track queue depth ### scanner.py: - Record number of documents found in each scan - Enables monitoring of scan throughput ### processor.py: - Record processing duration for each document - Track success/failure status with timing - Record Qdrant upsert/delete operations - Handle all code paths (success, deletion, error) ### semantic.py: - Wrap Qdrant query_points with try/except - Record search operation success/failure ## Metrics Exposed: - mcp_vector_sync_documents_scanned_total - mcp_vector_sync_documents_processed_total{status} - mcp_vector_sync_processing_duration_seconds (histogram) - mcp_vector_sync_queue_size (gauge) - mcp_qdrant_operations_total{operation,status} This enables monitoring of: - Scan and processing throughput - Processing latency (P50/P95/P99) - Error rates for processing and Qdrant operations - Queue depth trends - Complete observability of vector sync pipeline ## Testing: Verified locally that metrics are recorded correctly: - 36 documents scanned - 3 documents processed (avg 7.5s each) - 3 successful Qdrant upsert operations - Search operations tracked ## Deployment: Enable dashboard provisioning in Helm values: ```yaml dashboards: enabled: true grafanaFolder: "Nextcloud MCP" ``` 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- charts/nextcloud-mcp-server/Chart.yaml | 4 + charts/nextcloud-mcp-server/README.md | 66 + .../nextcloud-mcp-server/dashboards/README.md | 141 +- .../dashboards/nextcloud-mcp-server.json | 1968 +++++++++++++---- .../nextcloud-mcp-server/templates/NOTES.txt | 24 + .../templates/dashboard-configmap.yaml | 24 + charts/nextcloud-mcp-server/values.yaml | 13 + nextcloud_mcp_server/observability/metrics.py | 43 + nextcloud_mcp_server/server/semantic.py | 128 +- nextcloud_mcp_server/vector/processor.py | 123 +- nextcloud_mcp_server/vector/scanner.py | 4 + 11 files changed, 1976 insertions(+), 562 deletions(-) create mode 100644 
charts/nextcloud-mcp-server/templates/dashboard-configmap.yaml diff --git a/charts/nextcloud-mcp-server/Chart.yaml b/charts/nextcloud-mcp-server/Chart.yaml index 8fbd495..720783d 100644 --- a/charts/nextcloud-mcp-server/Chart.yaml +++ b/charts/nextcloud-mcp-server/Chart.yaml @@ -21,6 +21,10 @@ home: https://github.com/cbcoutinho/nextcloud-mcp-server sources: - https://github.com/cbcoutinho/nextcloud-mcp-server icon: https://raw.githubusercontent.com/nextcloud/server/master/core/img/logo/logo.svg +annotations: + # Grafana dashboard support + grafana_dashboard: "true" + grafana_dashboard_folder: "Nextcloud MCP" dependencies: - name: qdrant version: "1.15.5" diff --git a/charts/nextcloud-mcp-server/README.md b/charts/nextcloud-mcp-server/README.md index 0c73e68..d1cb5c4 100644 --- a/charts/nextcloud-mcp-server/README.md +++ b/charts/nextcloud-mcp-server/README.md @@ -280,6 +280,72 @@ Use OpenAI or any OpenAI-compatible API instead of Ollama. | `openai.secretKey` | Key in secret containing API key | `api-key` | | `openai.baseUrl` | Custom API endpoint (optional) | `""` | +#### Observability & Monitoring + +The chart includes comprehensive observability features including Prometheus metrics, OpenTelemetry tracing, and Grafana dashboards. 
+ +**Metrics Configuration:** + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `observability.metrics.enabled` | Enable Prometheus metrics | `true` | +| `observability.metrics.port` | Metrics port | `9090` | +| `observability.metrics.path` | Metrics endpoint path | `/metrics` | + +**Tracing Configuration:** + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `observability.tracing.enabled` | Enable OpenTelemetry tracing | `false` | +| `observability.tracing.endpoint` | OTLP collector endpoint | `""` | +| `observability.tracing.serviceName` | Service name in traces | `nextcloud-mcp-server` | +| `observability.tracing.samplingRate` | Trace sampling rate (0.0-1.0) | `1.0` | + +**Logging Configuration:** + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `observability.logging.format` | Log format (json or text) | `json` | +| `observability.logging.level` | Log level | `INFO` | +| `observability.logging.includeTraceContext` | Include trace IDs in logs | `true` | + +**ServiceMonitor (Prometheus Operator):** + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `serviceMonitor.enabled` | Create ServiceMonitor resource | `false` | +| `serviceMonitor.interval` | Scrape interval | `30s` | +| `serviceMonitor.scrapeTimeout` | Scrape timeout | `10s` | +| `serviceMonitor.labels` | Additional labels for ServiceMonitor | `{}` | + +**PrometheusRule (Prometheus Operator):** + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `prometheusRule.enabled` | Create PrometheusRule with alert rules | `false` | +| `prometheusRule.labels` | Additional labels for PrometheusRule | `{}` | + +**Grafana Dashboards:** + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `dashboards.enabled` | Enable automatic dashboard provisioning | `false` | +| `dashboards.grafanaFolder` | Grafana folder name for dashboards | 
`Nextcloud MCP` | +| `dashboards.labels` | Additional labels for dashboard ConfigMap | `{}` | +| `dashboards.annotations` | Additional annotations for dashboard ConfigMap | `{}` | + +When `dashboards.enabled` is `true`, a ConfigMap containing the Grafana dashboard is created and labeled `grafana_dashboard: "1"`. This enables automatic discovery by Grafana sidecar containers (commonly used with kube-prometheus-stack). + +The dashboard provides comprehensive monitoring, including: +- HTTP request metrics (RED pattern: Rate, Errors, Duration) +- MCP tool performance and errors +- Nextcloud API performance by app (notes, calendar, contacts, etc.) +- OAuth token operations and cache hit rates +- External dependency health (Nextcloud, Qdrant, Keycloak, Unstructured API) +- Vector sync processing pipeline (when enabled) + +For manual import or more details, see `charts/nextcloud-mcp-server/dashboards/README.md`. + ## Examples ### Example 1: Basic Auth with Ingress diff --git a/charts/nextcloud-mcp-server/dashboards/README.md b/charts/nextcloud-mcp-server/dashboards/README.md index ff6f6f1..af04709 100644 --- a/charts/nextcloud-mcp-server/dashboards/README.md +++ b/charts/nextcloud-mcp-server/dashboards/README.md @@ -6,14 +6,57 @@ This directory contains example Grafana dashboards for monitoring the Nextcloud ### nextcloud-mcp-server.json -Comprehensive dashboard with the following panels: +All-in-one Operations Dashboard with comprehensive monitoring across all system components. -- **Request Rate**: HTTP requests per second by method and endpoint -- **Error Rate**: Percentage of 5xx errors -- **Request Latency**: P50 and P95 latency by endpoint -- **Top MCP Tools**: Most frequently called tools -- **Nextcloud API Latency**: API call latency by app (notes, calendar, etc.) 
-- **Vector Sync Queue**: Queue size for background document processing +#### Overview Row +High-level metrics for quick health assessment: +- **Request Rate** (stat): Total requests per second +- **Error Rate** (stat): Percentage of 5xx errors with color thresholds +- **P95 Latency** (stat): 95th percentile request latency +- **Active Requests** (stat): Current in-flight requests + +#### HTTP Metrics (RED Pattern) +Core request/error/duration metrics: +- **Request Rate by Endpoint** (timeseries): RPS breakdown by endpoint +- **Error Rate by Status Code** (timeseries): Error rates for 4xx/5xx codes +- **Latency Percentiles** (timeseries): P50, P95, P99 latency trends +- **Status Code Distribution** (piechart): Percentage breakdown of all status codes + +#### MCP Tools Row +MCP-specific tool performance: +- **Top Tools by Call Volume** (bargauge): Top 10 most-called tools +- **Tool Error Rate** (timeseries): Error rates per tool +- **Tool Execution Duration** (timeseries): P95 latency by tool + +#### Nextcloud API Row +Backend API performance metrics: +- **API Calls by App** (timeseries): Request rate per Nextcloud app (notes, calendar, contacts, etc.) 
+- **API Latency by App** (timeseries): P95 latency per app +- **API Retries by Reason** (timeseries): Retry patterns (429, timeout, connection errors) +- **API Error Rate** (stat): Overall API error percentage + +#### OAuth & Authentication Row +OAuth token operations and caching: +- **Token Validations** (timeseries): Success/failure rates for token validation +- **Token Exchange Operations** (timeseries): RFC 8693 token exchange operations +- **Token Cache Hit Rate** (stat): Percentage of cache hits (color-coded: red<50%, yellow<80%, green≥80%) +- **Refresh Token Operations** (timeseries): Refresh token storage operations by type + +#### Dependencies & Health Row +External dependency status monitoring: +- **Nextcloud Health** (stat): UP/DOWN status with color coding +- **Qdrant Health** (stat): Vector database health status +- **Keycloak Health** (stat): Identity provider health status +- **Unstructured API Health** (stat): Document processing API status +- **Health Check Duration** (timeseries): Health check latency by dependency +- **Database Operation Latency** (timeseries): P95 latency for DB operations (SQLite, Qdrant) + +#### Vector Sync Row (when enabled) +Document processing pipeline metrics: +- **Documents Processed Rate** (timeseries): Processing throughput by status (success/failure) +- **Processing Queue Depth** (gauge): Current queue size with thresholds (yellow>50, red>100) +- **Qdrant Operations** (timeseries): Vector database operations by type +- **Document Processing Duration** (timeseries): P95 processing latency ## Importing to Grafana @@ -25,49 +68,73 @@ Comprehensive dashboard with the following panels: 4. Select your Prometheus data source 5. Click "Import" -### Automated Import (Kubernetes) +### Automated Import (Helm Chart) -If using the Grafana Operator or kube-prometheus-stack, you can create a ConfigMap: +The Helm chart now supports automatic dashboard provisioning via the Grafana sidecar pattern. 
+ +#### Option 1: Using Helm Chart (Recommended) + +Enable dashboard provisioning in your Helm values: + +```yaml +# values.yaml for nextcloud-mcp-server chart +dashboards: + enabled: true + grafanaFolder: "Nextcloud MCP" # Folder name in Grafana + labels: {} # Additional labels if needed +``` + +Then deploy or upgrade: ```bash -kubectl create configmap nextcloud-mcp-dashboards \ +helm upgrade --install nextcloud-mcp nextcloud-mcp-server \ + --set dashboards.enabled=true +``` + +The dashboard will be automatically imported by Grafana if the sidecar is configured +to watch for ConfigMaps with label `grafana_dashboard: "1"`. + +#### Option 2: Using kube-prometheus-stack + +If using kube-prometheus-stack with Grafana sidecar enabled, the dashboard will be +automatically discovered and imported. Ensure your Grafana deployment has: + +```yaml +# kube-prometheus-stack values +grafana: + sidecar: + dashboards: + enabled: true + label: grafana_dashboard + folder: /tmp/dashboards + provider: + foldersFromFilesStructure: true +``` + +#### Option 3: Manual ConfigMap Creation + +For other Grafana setups, create a ConfigMap manually: + +```bash +kubectl create configmap nextcloud-mcp-dashboard \ --from-file=nextcloud-mcp-server.json \ -n monitoring -# Add label for Grafana sidecar to discover -kubectl label configmap nextcloud-mcp-dashboards \ +# Add sidecar discovery label; set the folder via an annotation (label values cannot contain spaces; the sidecar must be configured to read this annotation) +kubectl label configmap nextcloud-mcp-dashboard \ grafana_dashboard=1 \ + -n monitoring && kubectl annotate configmap nextcloud-mcp-dashboard grafana_folder="Nextcloud MCP" \ -n monitoring ``` -Or add to your Helm values: - -```yaml -# values.yaml for kube-prometheus-stack -grafana: - dashboardProviders: - dashboardproviders.yaml: - apiVersion: 1 - providers: - - name: 'nextcloud-mcp' - orgId: 1 - folder: 'Nextcloud MCP' - type: file - disableDeletion: false - editable: true - options: - path: /var/lib/grafana/dashboards/nextcloud-mcp - - dashboardsConfigMaps: - nextcloud-mcp: nextcloud-mcp-dashboards -``` - ## Dashboard Variables -The dashboard includes two variables: +The 
dashboard includes four template variables for dynamic filtering: -- **Data Source**: Select your Prometheus data source -- **Namespace**: Filter metrics by Kubernetes namespace +- **datasource**: Select your Prometheus data source +- **namespace**: Filter metrics by Kubernetes namespace (supports "All") +- **pod**: Filter by specific pod(s) - multi-select enabled (supports "All") +- **interval**: Query interval for rate calculations (1m, 5m, 10m, 30m, 1h - default: 5m) ## Customization diff --git a/charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json b/charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json index 3a5e33c..90a31a6 100644 --- a/charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json +++ b/charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json @@ -1,157 +1,92 @@ { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, + "graphTooltip": 1, "id": null, - "links": [], - "liveNow": false, "panels": [ { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "tooltip": false, - "viz": false, - "legend": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": 
"absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "reqps" - }, - "overrides": [] - }, + "collapsed": false, "gridPos": { - "h": 8, - "w": 12, + "h": 1, + "w": 24, "x": 0, "y": 0 }, "id": 1, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "sum(rate(mcp_http_requests_total{namespace=\"$namespace\"}[5m])) by (method, endpoint)", - "legendFormat": "{{method}} {{endpoint}}", - "refId": "A" - } - ], - "title": "Request Rate", - "type": "timeseries" + "panels": [], + "title": "Overview", + "type": "row" }, { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "$datasource" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "tooltip": false, - "viz": false, - "legend": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "line" - } + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] }, - "mappings": [], - "max": 100, - "min": 0, + "unit": "reqps" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false 
+ }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "sum(rate(mcp_http_requests_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval]))", + "legendFormat": "requests/s", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, "thresholds": { "mode": "absolute", "steps": [ @@ -170,142 +105,218 @@ ] }, "unit": "percent" - }, - "overrides": [] + } }, "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 0 + "h": 4, + "w": 6, + "x": 6, + "y": 1 }, - "id": 2, + "id": 3, "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true + "colorMode": "value", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false }, - "tooltip": { - "mode": "multi", - "sort": "none" - } + "textMode": "value_and_name" }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "sum(rate(mcp_http_requests_total{status_code=~\"5..\", namespace=\"$namespace\"}[5m])) / sum(rate(mcp_http_requests_total{namespace=\"$namespace\"}[5m])) * 100", - "legendFormat": "Error Rate", + "expr": "100 * sum(rate(mcp_http_requests_total{namespace=\"$namespace\", pod=~\"$pod\", status_code=~\"5..\"}[$interval])) / sum(rate(mcp_http_requests_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval]))", + "legendFormat": "error %", "refId": "A" } ], - "title": "Error Rate (%)", - "type": "timeseries" + "title": "Error Rate", + "type": "stat" }, { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "$datasource" }, "fieldConfig": { "defaults": { "color": { - "mode": "palette-classic" + "mode": "thresholds" }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - 
"barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "tooltip": false, - "viz": false, - "legend": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], + "decimals": 3, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null + }, + { + "color": "yellow", + "value": 0.5 + }, + { + "color": "red", + "value": 1 } ] }, "unit": "s" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 4, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false }, - "overrides": [] + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (le))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "P95 Latency", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 10 + }, + { + "color": "red", + "value": 50 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 5, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "sum(mcp_http_requests_in_progress{namespace=\"$namespace\", pod=~\"$pod\"})", + "legendFormat": 
"in-flight", + "refId": "A" + } + ], + "title": "Active Requests", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 5 + }, + "id": 10, + "panels": [], + "title": "HTTP Metrics (RED Pattern)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "reqps" + } }, "gridPos": { "h": 8, "w": 12, "x": 0, - "y": 8 + "y": 6 }, - "id": 3, + "id": 11, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "lastNotNull", + "mean", + "max" + ], "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" + "placement": "bottom" } }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "histogram_quantile(0.95, sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le, endpoint))", - "legendFormat": "{{endpoint}} (p95)", + "expr": "sum(rate(mcp_http_requests_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (endpoint)", + "legendFormat": "{{endpoint}}", "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "histogram_quantile(0.50, sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le, endpoint))", - "legendFormat": "{{endpoint}} (p50)", - "refId": "B" } ], - "title": "Request Latency (P50/P95)", + "title": "Request Rate by Endpoint", "type": "timeseries" }, { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "$datasource" }, "fieldConfig": { "defaults": { @@ -313,272 +324,1299 @@ "mode": "palette-classic" }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, "drawStyle": 
"line", "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "tooltip": false, - "viz": false, - "legend": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "lineInterpolation": "smooth", + "showPoints": "never" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] + "unit": "reqps" + } }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 8 + "y": 6 }, - "id": 4, + "id": 12, "options": { "legend": { - "calcs": ["mean", "max"], + "calcs": [ + "lastNotNull", + "mean", + "max" + ], "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" + "placement": "bottom" } }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" + "expr": "sum(rate(mcp_http_requests_total{namespace=\"$namespace\", pod=~\"$pod\", status_code=~\"4..|5..\"}[$interval])) by (status_code)", + "legendFormat": "{{status_code}}", + "refId": "A" + } + ], + "title": "Error Rate by Status Code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 0, + "lineInterpolation": "smooth", + "showPoints": "never" }, - "expr": "topk(10, sum(rate(mcp_tool_calls_total{namespace=\"$namespace\"}[5m])) by (tool_name))", + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 13, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.50, 
sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (le))", + "legendFormat": "p50", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.95, sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (le))", + "legendFormat": "p95", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (le))", + "legendFormat": "p99", + "refId": "C" + } + ], + "title": "Latency Percentiles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "percent" + ], + "displayMode": "table", + "placement": "right" + }, + "pieType": "donut" + }, + "targets": [ + { + "expr": "sum(increase(mcp_http_requests_total{namespace=\"$namespace\", pod=~\"$pod\"}[$__range])) by (status_code)", + "legendFormat": "{{status_code}}", + "refId": "A" + } + ], + "title": "Status Code Distribution", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 20, + "panels": [], + "title": "MCP Tools", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 21, + "options": { + "displayMode": "gradient", + "orientation": "horizontal", + "showUnfilled": true + }, + "targets": [ + { + "expr": "topk(10, sum(increase(mcp_tool_calls_total{namespace=\"$namespace\", pod=~\"$pod\"}[$__range])) by (tool_name))", "legendFormat": "{{tool_name}}", "refId": "A" } ], - "title": "Top MCP Tools 
by Volume", - "type": "timeseries" + "title": "Top Tools by Call Volume", + "type": "bargauge" }, { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "$datasource" }, "fieldConfig": { "defaults": { - "color": { - "mode": "palette-classic" - }, "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, - "gradientMode": "none", - "hideFrom": { - "tooltip": false, - "viz": false, - "legend": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } + "lineInterpolation": "smooth", + "showPoints": "never" }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "s" - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 16 - }, - "id": 5, - "options": { - "legend": { - "calcs": ["mean", "max"], - "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" + "unit": "reqps" } }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "histogram_quantile(0.95, sum(rate(mcp_nextcloud_api_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le, app))", - "legendFormat": "{{app}} (p95)", - "refId": "A" - } - ], - "title": "Nextcloud API Latency by App", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 
10, - "gradientMode": "none", - "hideFrom": { - "tooltip": false, - "viz": false, - "legend": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "never", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - } - ] - }, - "unit": "short" - }, - "overrides": [] - }, "gridPos": { "h": 8, "w": 12, "x": 12, - "y": 16 + "y": 23 }, - "id": 6, + "id": 22, "options": { "legend": { - "calcs": ["mean", "lastNotNull"], + "calcs": [ + "lastNotNull", + "mean", + "max" + ], "displayMode": "table", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "multi", - "sort": "none" + "placement": "bottom" } }, "targets": [ { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "expr": "mcp_vector_sync_queue_size{namespace=\"$namespace\"}", - "legendFormat": "Queue Size", + "expr": "sum(rate(mcp_tool_calls_total{namespace=\"$namespace\", pod=~\"$pod\", status=\"error\"}[$interval])) by (tool_name)", + "legendFormat": "{{tool_name}}", "refId": "A" } ], - "title": "Vector Sync Queue Size", + "title": "Tool Error Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 0, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 31 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(mcp_tool_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (tool_name, le))", + 
"legendFormat": "{{tool_name}}", + "refId": "A" + } + ], + "title": "Tool Execution Duration (P95)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 39 + }, + "id": 30, + "panels": [], + "title": "Nextcloud API", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum(rate(mcp_nextcloud_api_requests_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (app)", + "legendFormat": "{{app}}", + "refId": "A" + } + ], + "title": "API Calls by App", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 0, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "id": 32, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(mcp_nextcloud_api_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (app, le))", + "legendFormat": "{{app}}", + "refId": "A" + } + ], + "title": "API Latency by App (P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + 
}, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 33, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum(rate(mcp_nextcloud_api_retries_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (reason)", + "legendFormat": "{{reason}}", + "refId": "A" + } + ], + "title": "API Retries by Reason", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 34, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "100 * sum(rate(mcp_nextcloud_api_requests_total{namespace=\"$namespace\", pod=~\"$pod\", status_code=~\"5..\"}[$interval])) / sum(rate(mcp_nextcloud_api_requests_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval]))", + "legendFormat": "error %", + "refId": "A" + } + ], + "title": "API Error Rate", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 56 + }, + "id": 40, + "panels": [], + "title": "OAuth & Authentication", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + 
"y": 57 + }, + "id": 41, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum(rate(mcp_oauth_token_validations_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (status)", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Token Validations", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 57 + }, + "id": 42, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum(rate(mcp_oauth_token_exchange_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (status)", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Token Exchange Operations", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 2, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "green", + "value": 80 + } + ] + }, + "unit": "percent" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 43, + "options": { + "colorMode": "value", + "graphMode": "area", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "100 * sum(rate(mcp_oauth_token_cache_hits_total{namespace=\"$namespace\", pod=~\"$pod\", result=\"hit\"}[$interval])) / 
sum(rate(mcp_oauth_token_cache_hits_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval]))", + "legendFormat": "hit %", + "refId": "A" + } + ], + "title": "Token Cache Hit Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "reqps" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 44, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum(rate(mcp_oauth_refresh_token_operations_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (operation)", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "title": "Refresh Token Operations", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 73 + }, + "id": 50, + "panels": [], + "title": "Dependencies & Health", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + } + }, + "type": "value" + }, + { + "options": { + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 0, + "y": 74 + }, + "id": 51, + "options": { + "colorMode": "value", + "graphMode": "none", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": 
"mcp_dependency_health{namespace=\"$namespace\", pod=~\"$pod\", dependency=\"nextcloud\"}", + "legendFormat": "status", + "refId": "A" + } + ], + "title": "Nextcloud Health", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + } + }, + "type": "value" + }, + { + "options": { + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 6, + "y": 74 + }, + "id": 52, + "options": { + "colorMode": "value", + "graphMode": "none", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "mcp_dependency_health{namespace=\"$namespace\", pod=~\"$pod\", dependency=\"qdrant\"}", + "legendFormat": "status", + "refId": "A" + } + ], + "title": "Qdrant Health", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + } + }, + "type": "value" + }, + { + "options": { + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 12, + "y": 74 + }, + "id": 53, + "options": { + "colorMode": "value", + "graphMode": "none", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" 
+ ], + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "mcp_dependency_health{namespace=\"$namespace\", pod=~\"$pod\", dependency=\"keycloak\"}", + "legendFormat": "status", + "refId": "A" + } + ], + "title": "Keycloak Health", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "text": "DOWN" + } + }, + "type": "value" + }, + { + "options": { + "1": { + "color": "green", + "text": "UP" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "green", + "value": 1 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 4, + "w": 6, + "x": 18, + "y": 74 + }, + "id": 54, + "options": { + "colorMode": "value", + "graphMode": "none", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "values": false + }, + "textMode": "value_and_name" + }, + "targets": [ + { + "expr": "mcp_dependency_health{namespace=\"$namespace\", pod=~\"$pod\", dependency=\"unstructured\"}", + "legendFormat": "status", + "refId": "A" + } + ], + "title": "Unstructured API Health", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 0, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 78 + }, + "id": 55, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "mcp_dependency_check_duration_seconds{namespace=\"$namespace\", pod=~\"$pod\"}", + "legendFormat": "{{dependency}}", + "refId": "A" + } + ], + "title": 
"Health Check Duration", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 0, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 78 + }, + "id": 56, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(mcp_db_operation_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (db, operation, le))", + "legendFormat": "{{db}}/{{operation}}", + "refId": "A" + } + ], + "title": "Database Operation Latency", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 86 + }, + "id": 60, + "panels": [], + "title": "Vector Sync (when enabled)", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 87 + }, + "id": 61, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum(rate(mcp_vector_sync_documents_processed_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (status)", + "legendFormat": "{{status}}", + "refId": "A" + } + ], + "title": "Documents Processed Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "max": 200, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + }, + { + "color": "yellow", + "value": 50 + }, + { + "color": "red", + "value": 100 + } + ] + }, + "unit": "short" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 87 + }, + "id": 62, + "options": { + "showThresholdLabels": true, + "showThresholdMarkers": true + }, + "targets": [ + { + "expr": "mcp_vector_sync_queue_size{namespace=\"$namespace\", pod=~\"$pod\"}", + "legendFormat": "queue", + "refId": "A" + } + ], + "title": "Processing Queue Depth", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 10, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "ops" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 95 + }, + "id": 63, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "sum(rate(mcp_qdrant_operations_total{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (operation)", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "title": "Qdrant Operations", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "custom": { + "drawStyle": "line", + "fillOpacity": 0, + "lineInterpolation": "smooth", + "showPoints": "never" + }, + "unit": "s" + } + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 95 + }, + "id": 64, + "options": { + "legend": { + "calcs": [ + "lastNotNull", + "mean", + "max" + ], + "displayMode": "table", + "placement": "bottom" + } + }, + "targets": [ + { + "expr": "histogram_quantile(0.95, sum(rate(mcp_vector_sync_processing_duration_seconds_bucket{namespace=\"$namespace\", pod=~\"$pod\"}[$interval])) by (le))", + "legendFormat": "p95", + "refId": "A" + } + ], + "title": "Document Processing Duration (P95)", "type": "timeseries" } 
], "refresh": "30s", - "schemaVersion": 38, - "style": "dark", - "tags": ["nextcloud", "mcp", "observability"], + "tags": [ + "nextcloud-mcp-server", + "operations", + "kubernetes", + "mcp" + ], "templating": { "list": [ { "current": { - "selected": false, "text": "Prometheus", - "value": "Prometheus" + "value": "prometheus" }, "hide": 0, "includeAll": false, - "label": "Data Source", "multi": false, "name": "datasource", "options": [], @@ -589,31 +1627,79 @@ "type": "datasource" }, { - "current": { - "selected": false, - "text": "default", - "value": "default" - }, + "current": {}, "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "$datasource" }, - "definition": "label_values(mcp_http_requests_total, namespace)", "hide": 0, - "includeAll": false, - "label": "Namespace", + "includeAll": true, "multi": false, "name": "namespace", "options": [], - "query": { - "query": "label_values(mcp_http_requests_total, namespace)", - "refId": "PrometheusVariableQueryEditor-VariableQuery" - }, - "refresh": 1, + "query": "label_values(mcp_http_requests_total, namespace)", + "refresh": 2, "regex": "", "skipUrlSync": false, - "sort": 0, + "sort": 1, "type": "query" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "hide": 0, + "includeAll": true, + "multi": true, + "name": "pod", + "options": [], + "query": "label_values(mcp_http_requests_total{namespace=\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "text": "5m", + "value": "5m" + }, + "hide": 0, + "name": "interval", + "options": [ + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": true, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + } + ], + "query": "1m,5m,10m,30m,1h", 
+ "refresh": 0, + "skipUrlSync": false, + "type": "interval" } ] }, @@ -621,10 +1707,8 @@ "from": "now-6h", "to": "now" }, - "timepicker": {}, - "timezone": "", - "title": "Nextcloud MCP Server", + "timezone": "browser", + "title": "Nextcloud MCP Server - Operations", "uid": "nextcloud-mcp-server", - "version": 1, - "weekStart": "" + "version": 1 } diff --git a/charts/nextcloud-mcp-server/templates/NOTES.txt b/charts/nextcloud-mcp-server/templates/NOTES.txt index 2ab528f..3533cab 100644 --- a/charts/nextcloud-mcp-server/templates/NOTES.txt +++ b/charts/nextcloud-mcp-server/templates/NOTES.txt @@ -96,6 +96,30 @@ Your Nextcloud MCP Server has been deployed in {{ .Values.auth.mode }} authentic kubectl --namespace {{ .Release.Namespace }} exec -it deploy/{{ include "nextcloud-mcp-server.fullname" . }} -- curl -s http://localhost:{{ include "nextcloud-mcp-server.port" . }}/user/page | grep "Vector Sync" {{- end }} +{{- if .Values.dashboards.enabled }} + +6. Grafana Dashboards: + - Dashboard provisioning: Enabled + - ConfigMap: {{ include "nextcloud-mcp-server.fullname" . }}-dashboard + - Grafana Folder: {{ .Values.dashboards.grafanaFolder }} + + The dashboard will be automatically imported by Grafana if the sidecar is configured + to watch for ConfigMaps with label "grafana_dashboard: 1". + + To manually import the dashboard: + kubectl --namespace {{ .Release.Namespace }} get configmap {{ include "nextcloud-mcp-server.fullname" . }}-dashboard -o jsonpath='{.data.nextcloud-mcp-server\.json}' | jq . > dashboard.json + + Then import dashboard.json via Grafana UI (Dashboards → Import). +{{- else }} + +6. 
Grafana Dashboards: + - Dashboard provisioning: Disabled + - To enable automatic dashboard provisioning, set: dashboards.enabled=true + + Manual import option: + The dashboard JSON is available in the chart at charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json +{{- end }} + For more information and documentation: - GitHub: https://github.com/cbcoutinho/nextcloud-mcp-server - Documentation: https://github.com/cbcoutinho/nextcloud-mcp-server#readme diff --git a/charts/nextcloud-mcp-server/templates/dashboard-configmap.yaml b/charts/nextcloud-mcp-server/templates/dashboard-configmap.yaml new file mode 100644 index 0000000..b1d59dd --- /dev/null +++ b/charts/nextcloud-mcp-server/templates/dashboard-configmap.yaml @@ -0,0 +1,24 @@ +{{- if .Values.dashboards.enabled }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "nextcloud-mcp-server.fullname" . }}-dashboard + namespace: {{ .Release.Namespace }} + labels: + {{- include "nextcloud-mcp-server.labels" . | nindent 4 }} + {{- with .Values.dashboards.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} + # Grafana sidecar discovery labels + grafana_dashboard: "1" + {{- if .Values.dashboards.grafanaFolder }} + grafana_folder: {{ .Values.dashboards.grafanaFolder | quote }} + {{- end }} + annotations: + {{- with .Values.dashboards.annotations }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +data: + nextcloud-mcp-server.json: |- +{{ .Files.Get "dashboards/nextcloud-mcp-server.json" | indent 4 }} +{{- end }} diff --git a/charts/nextcloud-mcp-server/values.yaml b/charts/nextcloud-mcp-server/values.yaml index a3ef82f..9330fa5 100644 --- a/charts/nextcloud-mcp-server/values.yaml +++ b/charts/nextcloud-mcp-server/values.yaml @@ -205,6 +205,19 @@ prometheusRule: # Additional labels for PrometheusRule (e.g., for Prometheus selector) # Example: { prometheus: kube-prometheus } +# Grafana dashboards (requires Grafana with sidecar enabled) +dashboards: + # Enable automatic dashboard provisioning via ConfigMap + enabled: false + # Grafana folder name where dashboards will be imported + # The grafana-sidecar looks for ConfigMaps with label "grafana_folder" + grafanaFolder: "Nextcloud MCP" + # Additional labels for dashboard ConfigMap + # These will be added alongside the required "grafana_dashboard: 1" label + labels: {} + # Additional annotations for dashboard ConfigMap + annotations: {} + service: type: ClusterIP port: 8000 diff --git a/nextcloud_mcp_server/observability/metrics.py b/nextcloud_mcp_server/observability/metrics.py index ae51217..6a67b49 100644 --- a/nextcloud_mcp_server/observability/metrics.py +++ b/nextcloud_mcp_server/observability/metrics.py @@ -352,3 +352,46 @@ def record_dependency_check(dependency: str, duration: float) -> None: duration: Check duration in seconds """ dependency_check_duration_seconds.labels(dependency=dependency).observe(duration) + + +def record_vector_sync_scan(documents_found: int) -> None: + """ + Record documents scanned during vector sync. + + Args: + documents_found: Number of documents discovered in scan + """ + vector_sync_documents_scanned_total.inc(documents_found) + + +def record_vector_sync_processing(duration: float, status: str = "success") -> None: + """ + Record document processing with duration and status. 
+ + Args: + duration: Processing duration in seconds + status: "success" or "error" + """ + vector_sync_documents_processed_total.labels(status=status).inc() + vector_sync_processing_duration_seconds.observe(duration) + + +def record_qdrant_operation(operation: str, status: str = "success") -> None: + """ + Record Qdrant vector database operation. + + Args: + operation: Operation type ("upsert", "search", "delete") + status: "success" or "error" + """ + qdrant_operations_total.labels(operation=operation, status=status).inc() + + +def update_vector_sync_queue_size(size: int) -> None: + """ + Update vector sync queue size gauge. + + Args: + size: Current queue size + """ + vector_sync_queue_size.set(size) diff --git a/nextcloud_mcp_server/server/semantic.py b/nextcloud_mcp_server/server/semantic.py index 135dd90..3d7f755 100644 --- a/nextcloud_mcp_server/server/semantic.py +++ b/nextcloud_mcp_server/server/semantic.py @@ -21,6 +21,7 @@ from nextcloud_mcp_server.models.semantic import ( SemanticSearchResult, VectorSyncStatusResponse, ) +from nextcloud_mcp_server.observability.metrics import record_qdrant_operation logger = logging.getLogger(__name__) @@ -85,26 +86,33 @@ def configure_semantic_tools(mcp: FastMCP): # Note: Currently only searching notes (doc_type="note") # Future: Remove doc_type filter to search all apps qdrant_client = await get_qdrant_client() - search_response = await qdrant_client.query_points( - collection_name=settings.get_collection_name(), - query=query_embedding, - query_filter=Filter( - must=[ - FieldCondition( - key="user_id", - match=MatchValue(value=username), - ), - FieldCondition( - key="doc_type", - match=MatchValue(value="note"), - ), - ] - ), - limit=limit * 2, # Get extra for filtering - score_threshold=score_threshold, - with_payload=True, - with_vectors=False, # Don't return vectors to save bandwidth - ) + try: + search_response = await qdrant_client.query_points( + collection_name=settings.get_collection_name(), + 
query=query_embedding, + query_filter=Filter( + must=[ + FieldCondition( + key="user_id", + match=MatchValue(value=username), + ), + FieldCondition( + key="doc_type", + match=MatchValue(value="note"), + ), + ] + ), + limit=limit * 2, # Get extra for filtering + score_threshold=score_threshold, + with_payload=True, + with_vectors=False, # Don't return vectors to save bandwidth + ) + # Record successful search operation + record_qdrant_operation("search", "success") + except Exception: + # Record failed search operation + record_qdrant_operation("search", "error") + raise logger.info( f"Qdrant returned {len(search_response.points)} results " @@ -331,21 +339,71 @@ def configure_semantic_tools(mcp: FastMCP): success=True, ) - # 4. Construct context from retrieved documents + # 4. Fetch full content for notes to provide complete context to LLM + # Filter out inaccessible notes (deleted or permissions changed) + client = await get_client(ctx) + accessible_results = [] + full_contents = [] # Full content for accessible notes + + for result in search_response.results: + if result.doc_type == "note": + try: + note = await client.notes.get_note(result.id) + # Note is accessible, store full content + accessible_results.append(result) + full_contents.append(note.get("content", "")) + logger.debug( + f"Fetched full content for note {result.id} " + f"(length: {len(full_contents[-1])} chars)" + ) + except Exception as e: + # Note might have been deleted or permissions changed + # Filter it out to avoid corrupting LLM with inaccessible data + logger.warning( + f"Failed to fetch full content for note {result.id}: {e}. " + f"Excluding from results." 
+ ) + else: + # Non-note document types (future: calendar, deck, files) + # For now, keep them with excerpts + accessible_results.append(result) + full_contents.append(None) + + # Check if we filtered out all results + if not accessible_results: + logger.warning(f"All search results became inaccessible for query: {query}") + return SamplingSearchResponse( + query=query, + generated_answer="All matching documents are no longer accessible.", + sources=[], + total_found=0, + search_method="semantic_sampling", + success=True, + ) + + # 5. Construct context from accessible documents with full content context_parts = [] - for idx, result in enumerate(search_response.results, 1): + for idx, (result, content) in enumerate( + zip(accessible_results, full_contents), 1 + ): + # Use full content if available (notes), otherwise use excerpt + if content is not None: + content_field = f"Content: {content}" + else: + content_field = f"Excerpt: {result.excerpt}" + context_parts.append( f"[Document {idx}]\n" f"Type: {result.doc_type}\n" f"Title: {result.title}\n" f"Category: {result.category}\n" - f"Excerpt: {result.excerpt}\n" + f"{content_field}\n" f"Relevance Score: {result.score:.2f}\n" ) context = "\n".join(context_parts) - # 5. Construct prompt - reuse user's query, add context and instructions + # 6. 
Construct prompt - reuse user's query, add context and instructions prompt = ( f"{query}\n\n" f"Here are relevant documents from Nextcloud (notes, calendar events, deck cards, files, contacts):\n\n" @@ -401,8 +459,8 @@ def configure_semantic_tools(mcp: FastMCP): return SamplingSearchResponse( query=query, generated_answer=generated_answer, - sources=search_response.results, - total_found=search_response.total_found, + sources=accessible_results, + total_found=len(accessible_results), search_method="semantic_sampling", model_used=sampling_result.model, stop_reason=sampling_result.stopReason, @@ -419,11 +477,11 @@ def configure_semantic_tools(mcp: FastMCP): generated_answer=( f"[Sampling request timed out]\n\n" f"The answer generation took too long (>30s). " - f"Found {search_response.total_found} relevant documents. " + f"Found {len(accessible_results)} relevant documents. " f"Please review the sources below or try a simpler query." ), - sources=search_response.results, - total_found=search_response.total_found, + sources=accessible_results, + total_found=len(accessible_results), search_method="semantic_sampling_timeout", success=True, ) @@ -454,11 +512,11 @@ def configure_semantic_tools(mcp: FastMCP): query=query, generated_answer=( f"[{user_message}]\n\n" - f"Found {search_response.total_found} relevant documents. " + f"Found {len(accessible_results)} relevant documents. " f"Please review the sources below." ), - sources=search_response.results, - total_found=search_response.total_found, + sources=accessible_results, + total_found=len(accessible_results), search_method=search_method, success=True, ) @@ -475,11 +533,11 @@ def configure_semantic_tools(mcp: FastMCP): query=query, generated_answer=( f"[Unexpected error during sampling]\n\n" - f"Found {search_response.total_found} relevant documents. " + f"Found {len(accessible_results)} relevant documents. " f"Please review the sources below." 
), - sources=search_response.results, - total_found=search_response.total_found, + sources=accessible_results, + total_found=len(accessible_results), search_method="semantic_sampling_error", success=True, ) diff --git a/nextcloud_mcp_server/vector/processor.py b/nextcloud_mcp_server/vector/processor.py index 5542cce..f89aae5 100644 --- a/nextcloud_mcp_server/vector/processor.py +++ b/nextcloud_mcp_server/vector/processor.py @@ -15,6 +15,10 @@ from qdrant_client.models import FieldCondition, Filter, MatchValue, PointStruct from nextcloud_mcp_server.client import NextcloudClient from nextcloud_mcp_server.config import get_settings from nextcloud_mcp_server.embedding import get_embedding_service +from nextcloud_mcp_server.observability.metrics import ( + record_qdrant_operation, + record_vector_sync_processing, +) from nextcloud_mcp_server.observability.tracing import trace_operation from nextcloud_mcp_server.vector.document_chunker import DocumentChunker from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client @@ -90,6 +94,8 @@ async def process_document(doc_task: DocumentTask, nc_client: NextcloudClient): doc_task: Document task to process nc_client: Authenticated Nextcloud client """ + start_time = time.time() + logger.debug( f"Processing {doc_task.doc_type}_{doc_task.doc_id} " f"for {doc_task.user_id} ({doc_task.operation})" @@ -105,58 +111,79 @@ async def process_document(doc_task: DocumentTask, nc_client: NextcloudClient): "vector_sync.doc_operation": doc_task.operation, }, ): - qdrant_client = await get_qdrant_client() - settings = get_settings() + try: + qdrant_client = await get_qdrant_client() + settings = get_settings() - # Handle deletion - if doc_task.operation == "delete": - await qdrant_client.delete( - collection_name=settings.get_collection_name(), - points_selector=Filter( - must=[ - FieldCondition( - key="user_id", - match=MatchValue(value=doc_task.user_id), - ), - FieldCondition( - key="doc_id", - 
match=MatchValue(value=doc_task.doc_id), - ), - FieldCondition( - key="doc_type", - match=MatchValue(value=doc_task.doc_type), - ), - ] - ), - ) - logger.info( - f"Deleted {doc_task.doc_type}_{doc_task.doc_id} for {doc_task.user_id}" - ) - return + # Handle deletion + if doc_task.operation == "delete": + await qdrant_client.delete( + collection_name=settings.get_collection_name(), + points_selector=Filter( + must=[ + FieldCondition( + key="user_id", + match=MatchValue(value=doc_task.user_id), + ), + FieldCondition( + key="doc_id", + match=MatchValue(value=doc_task.doc_id), + ), + FieldCondition( + key="doc_type", + match=MatchValue(value=doc_task.doc_type), + ), + ] + ), + ) + logger.info( + f"Deleted {doc_task.doc_type}_{doc_task.doc_id} for {doc_task.user_id}" + ) - # Handle indexing with retry - max_retries = 3 - retry_delay = 1.0 + # Record successful deletion metrics + duration = time.time() - start_time + record_qdrant_operation("delete", "success") + record_vector_sync_processing(duration, "success") + return - for attempt in range(max_retries): - try: - await _index_document(doc_task, nc_client, qdrant_client) - return # Success + # Handle indexing with retry + max_retries = 3 + retry_delay = 1.0 - except (HTTPStatusError, Exception) as e: - if attempt < max_retries - 1: - logger.warning( - f"Retry {attempt + 1}/{max_retries} for " - f"{doc_task.doc_type}_{doc_task.doc_id}: {e}" - ) - await anyio.sleep(retry_delay) - retry_delay *= 2 # Exponential backoff - else: - logger.error( - f"Failed to index {doc_task.doc_type}_{doc_task.doc_id} " - f"after {max_retries} retries: {e}" - ) - raise + for attempt in range(max_retries): + try: + await _index_document(doc_task, nc_client, qdrant_client) + + # Record successful processing metrics + duration = time.time() - start_time + record_qdrant_operation("upsert", "success") + record_vector_sync_processing(duration, "success") + return # Success + + except (HTTPStatusError, Exception) as e: + if attempt < max_retries 
- 1: + logger.warning( + f"Retry {attempt + 1}/{max_retries} for " + f"{doc_task.doc_type}_{doc_task.doc_id}: {e}" + ) + await anyio.sleep(retry_delay) + retry_delay *= 2 # Exponential backoff + else: + logger.error( + f"Failed to index {doc_task.doc_type}_{doc_task.doc_id} " + f"after {max_retries} retries: {e}" + ) + # Record failed processing metrics + duration = time.time() - start_time + record_qdrant_operation("upsert", "error") + record_vector_sync_processing(duration, "error") + raise + + except Exception: + # Catch any other unexpected errors + duration = time.time() - start_time + record_vector_sync_processing(duration, "error") + raise async def _index_document( diff --git a/nextcloud_mcp_server/vector/scanner.py b/nextcloud_mcp_server/vector/scanner.py index 1eea941..953219e 100644 --- a/nextcloud_mcp_server/vector/scanner.py +++ b/nextcloud_mcp_server/vector/scanner.py @@ -13,6 +13,7 @@ from qdrant_client.models import FieldCondition, Filter, MatchValue from nextcloud_mcp_server.client import NextcloudClient from nextcloud_mcp_server.config import get_settings +from nextcloud_mcp_server.observability.metrics import record_vector_sync_scan from nextcloud_mcp_server.observability.tracing import trace_operation from nextcloud_mcp_server.vector.qdrant_client import get_qdrant_client @@ -181,6 +182,9 @@ async def scan_user_documents( ] logger.info(f"[SCAN-{scan_id}] Found {len(notes)} notes for {user_id}") + # Record documents scanned + record_vector_sync_scan(len(notes)) + if initial_sync: # Send everything on first sync for note in notes: