From 093ac5b5bae4b67c244da817df87b23f7c5d5f29 Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sun, 9 Nov 2025 09:10:11 +0100 Subject: [PATCH 1/2] feat(helm): Add observability support with ServiceMonitor and Grafana dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add comprehensive observability configuration to Helm chart: **Helm Values:** - Add observability configuration section for metrics, tracing, and logging - Add serviceMonitor configuration (disabled by default) - Add prometheusRule configuration (disabled by default) **Templates:** - Update deployment to include observability environment variables - Update deployment to expose metrics port (9090) - Update service to expose metrics port - Add ServiceMonitor template for Prometheus Operator - Add PrometheusRule template with critical and warning alerts **Dashboards:** - Add comprehensive Grafana dashboard JSON with 6 panels: - Request Rate (by method and endpoint) - Error Rate (5xx errors percentage) - Request Latency (P50/P95 by endpoint) - Top MCP Tools (by invocation volume) - Nextcloud API Latency (by app) - Vector Sync Queue Size - Add dashboard README with import instructions **Alert Rules:** - Critical: Server down, high error rate (>5%), high latency (>1s), dependency down - Warning: Token validation errors (>1%), vector sync queue high (>100), Qdrant slow (>500ms) ServiceMonitor, PrometheusRule, and tracing are opt-in via values.yaml configuration; the metrics endpoint (port 9090) and the logging settings are enabled by default (observability.metrics.enabled: true). 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../nextcloud-mcp-server/dashboards/README.md | 90 +++ .../dashboards/nextcloud-mcp-server.json | 630 ++++++++++++++++++ .../templates/deployment.yaml | 26 + .../templates/prometheusrule.yaml | 92 +++ .../templates/service.yaml | 6 + .../templates/servicemonitor.yaml | 32 + charts/nextcloud-mcp-server/values.yaml | 37 + 7 files changed, 913 insertions(+) create mode 100644 charts/nextcloud-mcp-server/dashboards/README.md create mode 100644 charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json create mode 100644 charts/nextcloud-mcp-server/templates/prometheusrule.yaml create mode 100644 charts/nextcloud-mcp-server/templates/servicemonitor.yaml diff --git a/charts/nextcloud-mcp-server/dashboards/README.md b/charts/nextcloud-mcp-server/dashboards/README.md new file mode 100644 index 0000000..ff6f6f1 --- /dev/null +++ b/charts/nextcloud-mcp-server/dashboards/README.md @@ -0,0 +1,90 @@ +# Grafana Dashboards + +This directory contains example Grafana dashboards for monitoring the Nextcloud MCP Server. + +## Dashboards + +### nextcloud-mcp-server.json + +Comprehensive dashboard with the following panels: + +- **Request Rate**: HTTP requests per second by method and endpoint +- **Error Rate**: Percentage of 5xx errors +- **Request Latency**: P50 and P95 latency by endpoint +- **Top MCP Tools**: Most frequently called tools +- **Nextcloud API Latency**: API call latency by app (notes, calendar, etc.) +- **Vector Sync Queue**: Queue size for background document processing + +## Importing to Grafana + +### Manual Import + +1. Open Grafana UI +2. Navigate to Dashboards → Import +3. Upload `nextcloud-mcp-server.json` +4. Select your Prometheus data source +5. 
Click "Import" + +### Automated Import (Kubernetes) + +If using the Grafana Operator or kube-prometheus-stack, you can create a ConfigMap: + +```bash +kubectl create configmap nextcloud-mcp-dashboards \ + --from-file=nextcloud-mcp-server.json \ + -n monitoring + +# Add label for Grafana sidecar to discover +kubectl label configmap nextcloud-mcp-dashboards \ + grafana_dashboard=1 \ + -n monitoring +``` + +Or add to your Helm values: + +```yaml +# values.yaml for kube-prometheus-stack +grafana: + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: 'nextcloud-mcp' + orgId: 1 + folder: 'Nextcloud MCP' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/nextcloud-mcp + + dashboardsConfigMaps: + nextcloud-mcp: nextcloud-mcp-dashboards +``` + +## Dashboard Variables + +The dashboard includes two variables: + +- **Data Source**: Select your Prometheus data source +- **Namespace**: Filter metrics by Kubernetes namespace + +## Customization + +You can customize the dashboard by: + +1. Adjusting refresh rate (default: 30s) +2. Modifying time range (default: last 6 hours) +3. Adding new panels for specific metrics +4. Adjusting thresholds in existing panels + +## Metrics Reference + +All metrics are documented in `/docs/observability.md`. 
Key metric prefixes: + +- `mcp_http_*` - HTTP server metrics +- `mcp_tool_*` - MCP tool invocation metrics +- `mcp_nextcloud_api_*` - Nextcloud API call metrics +- `mcp_oauth_*` - OAuth token validation metrics +- `mcp_vector_sync_*` - Vector database sync metrics +- `mcp_db_*` - Database operation metrics diff --git a/charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json b/charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json new file mode 100644 index 0000000..3a5e33c --- /dev/null +++ b/charts/nextcloud-mcp-server/dashboards/nextcloud-mcp-server.json @@ -0,0 +1,630 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": ["mean", 
"max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(mcp_http_requests_total{namespace=\"$namespace\"}[5m])) by (method, endpoint)", + "legendFormat": "{{method}} {{endpoint}}", + "refId": "A" + } + ], + "title": "Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "line" + } + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 1 + }, + { + "color": "red", + "value": 5 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "sum(rate(mcp_http_requests_total{status_code=~\"5..\", namespace=\"$namespace\"}[5m])) / sum(rate(mcp_http_requests_total{namespace=\"$namespace\"}[5m])) * 100", + "legendFormat": "Error 
Rate", + "refId": "A" + } + ], + "title": "Error Rate (%)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 3, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le, endpoint))", + "legendFormat": "{{endpoint}} (p95)", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.50, sum(rate(mcp_http_request_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le, endpoint))", + "legendFormat": "{{endpoint}} (p50)", + "refId": "B" + } + ], + "title": "Request Latency (P50/P95)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": 
"palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "topk(10, sum(rate(mcp_tool_calls_total{namespace=\"$namespace\"}[5m])) by (tool_name))", + "legendFormat": "{{tool_name}}", + "refId": "A" + } + ], + "title": "Top MCP Tools by Volume", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 5, + "options": { + "legend": { + "calcs": ["mean", "max"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "histogram_quantile(0.95, sum(rate(mcp_nextcloud_api_duration_seconds_bucket{namespace=\"$namespace\"}[5m])) by (le, app))", + "legendFormat": "{{app}} (p95)", + "refId": "A" + } + ], + "title": "Nextcloud API Latency by App", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "tooltip": false, + "viz": false, + "legend": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 6, + "options": { + "legend": { + "calcs": ["mean", "lastNotNull"], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "targets": [ 
+ { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "expr": "mcp_vector_sync_queue_size{namespace=\"$namespace\"}", + "legendFormat": "Queue Size", + "refId": "A" + } + ], + "title": "Vector Sync Queue Size", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": ["nextcloud", "mcp", "observability"], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Data Source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(mcp_http_requests_total, namespace)", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(mcp_http_requests_total, namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Nextcloud MCP Server", + "uid": "nextcloud-mcp-server", + "version": 1, + "weekStart": "" +} diff --git a/charts/nextcloud-mcp-server/templates/deployment.yaml b/charts/nextcloud-mcp-server/templates/deployment.yaml index 08c14fc..bf969ae 100644 --- a/charts/nextcloud-mcp-server/templates/deployment.yaml +++ b/charts/nextcloud-mcp-server/templates/deployment.yaml @@ -56,6 +56,11 @@ spec: - name: http containerPort: {{ include "nextcloud-mcp-server.port" . 
}} protocol: TCP + {{- if .Values.observability.metrics.enabled }} + - name: metrics + containerPort: {{ .Values.observability.metrics.port }} + protocol: TCP + {{- end }} env: # Nextcloud connection - name: NEXTCLOUD_HOST @@ -200,6 +205,27 @@ spec: value: {{ .Values.openai.baseUrl | quote }} {{- end }} {{- end }} + # Observability + - name: METRICS_ENABLED + value: {{ .Values.observability.metrics.enabled | quote }} + - name: METRICS_PORT + value: {{ .Values.observability.metrics.port | quote }} + {{- if .Values.observability.tracing.enabled }} + - name: OTEL_ENABLED + value: "true" + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: {{ .Values.observability.tracing.endpoint | quote }} + - name: OTEL_SERVICE_NAME + value: {{ .Values.observability.tracing.serviceName | quote }} + - name: OTEL_TRACES_SAMPLER_ARG + value: {{ .Values.observability.tracing.samplingRate | quote }} + {{- end }} + - name: LOG_FORMAT + value: {{ .Values.observability.logging.format | quote }} + - name: LOG_LEVEL + value: {{ .Values.observability.logging.level | quote }} + - name: LOG_INCLUDE_TRACE_CONTEXT + value: {{ .Values.observability.logging.includeTraceContext | quote }} {{- with .Values.extraEnv }} {{- toYaml . | nindent 12 }} {{- end }} diff --git a/charts/nextcloud-mcp-server/templates/prometheusrule.yaml b/charts/nextcloud-mcp-server/templates/prometheusrule.yaml new file mode 100644 index 0000000..204d127 --- /dev/null +++ b/charts/nextcloud-mcp-server/templates/prometheusrule.yaml @@ -0,0 +1,92 @@ +{{- if and .Values.observability.metrics.enabled .Values.prometheusRule.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "nextcloud-mcp-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "nextcloud-mcp-server.labels" . | nindent 4 }} + {{- with .Values.prometheusRule.labels }} + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + groups: + - name: nextcloud-mcp-server.critical + interval: 30s + rules: + - alert: NextcloudMCPServerDown + expr: up{job="{{ include "nextcloud-mcp-server.fullname" . }}"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Nextcloud MCP Server is down" + description: "{{ `{{` }} $labels.pod {{ `}}` }} has been down for more than 5 minutes." + + - alert: NextcloudMCPHighErrorRate + expr: | + sum(rate(mcp_http_requests_total{status_code=~"5..", job="{{ include "nextcloud-mcp-server.fullname" . }}"}[5m])) + / sum(rate(mcp_http_requests_total{job="{{ include "nextcloud-mcp-server.fullname" . }}"}[5m])) > 0.05 + for: 5m + labels: + severity: critical + annotations: + summary: "High error rate on Nextcloud MCP Server" + description: "Error rate is {{ `{{` }} $value | humanizePercentage {{ `}}` }} (threshold: 5%)" + + - alert: NextcloudMCPHighLatency + expr: | + histogram_quantile(0.95, + sum(rate(mcp_http_request_duration_seconds_bucket{job="{{ include "nextcloud-mcp-server.fullname" . }}"}[5m])) by (le, endpoint) + ) > 1 + for: 5m + labels: + severity: critical + annotations: + summary: "High latency on Nextcloud MCP Server" + description: "P95 latency is {{ `{{` }} printf \"%.2fs\" $value {{ `}}` }} on {{ `{{` }} $labels.endpoint {{ `}}` }} (threshold: 1s)" + + - alert: NextcloudMCPDependencyDown + expr: mcp_dependency_health{job="{{ include "nextcloud-mcp-server.fullname" . }}"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "Nextcloud MCP dependency is down" + description: "Dependency {{ `{{` }} $labels.dependency {{ `}}` }} has been down for more than 2 minutes." + + - name: nextcloud-mcp-server.warning + interval: 30s + rules: + - alert: NextcloudMCPTokenValidationErrors + expr: | + sum(rate(mcp_oauth_token_validations_total{result="error", job="{{ include "nextcloud-mcp-server.fullname" . 
}}"}[10m])) + / sum(rate(mcp_oauth_token_validations_total{job="{{ include "nextcloud-mcp-server.fullname" . }}"}[10m])) > 0.01 + for: 10m + labels: + severity: warning + annotations: + summary: "High token validation error rate" + description: "Token validation error rate is {{ `{{` }} $value | humanizePercentage {{ `}}` }} (threshold: 1%)" + + - alert: NextcloudMCPVectorSyncQueueHigh + expr: mcp_vector_sync_queue_size{job="{{ include "nextcloud-mcp-server.fullname" . }}"} > 100 + for: 15m + labels: + severity: warning + annotations: + summary: "Vector sync queue is high" + description: "Vector sync queue size is {{ `{{` }} $value {{ `}}` }} (threshold: 100)" + + - alert: NextcloudMCPQdrantSlowQueries + expr: | + histogram_quantile(0.95, + sum(rate(mcp_db_operation_duration_seconds_bucket{db="qdrant", job="{{ include "nextcloud-mcp-server.fullname" . }}"}[10m])) by (le) + ) > 0.5 + for: 10m + labels: + severity: warning + annotations: + summary: "Qdrant queries are slow" + description: "P95 Qdrant query latency is {{ `{{` }} printf \"%.2fs\" $value {{ `}}` }} (threshold: 0.5s)" +{{- end }} diff --git a/charts/nextcloud-mcp-server/templates/service.yaml b/charts/nextcloud-mcp-server/templates/service.yaml index 2f477ba..af245e0 100644 --- a/charts/nextcloud-mcp-server/templates/service.yaml +++ b/charts/nextcloud-mcp-server/templates/service.yaml @@ -15,5 +15,11 @@ spec: targetPort: http protocol: TCP name: http + {{- if .Values.observability.metrics.enabled }} + - port: {{ .Values.observability.metrics.port }} + targetPort: metrics + protocol: TCP + name: metrics + {{- end }} selector: {{- include "nextcloud-mcp-server.selectorLabels" . 
| nindent 4 }} diff --git a/charts/nextcloud-mcp-server/templates/servicemonitor.yaml b/charts/nextcloud-mcp-server/templates/servicemonitor.yaml new file mode 100644 index 0000000..13bd34c --- /dev/null +++ b/charts/nextcloud-mcp-server/templates/servicemonitor.yaml @@ -0,0 +1,32 @@ +{{- if and .Values.observability.metrics.enabled .Values.serviceMonitor.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ include "nextcloud-mcp-server.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "nextcloud-mcp-server.labels" . | nindent 4 }} + {{- with .Values.serviceMonitor.labels }} + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + selector: + matchLabels: + {{- include "nextcloud-mcp-server.selectorLabels" . | nindent 6 }} + endpoints: + - port: metrics + path: {{ .Values.observability.metrics.path }} + interval: {{ .Values.serviceMonitor.interval }} + scrapeTimeout: {{ .Values.serviceMonitor.scrapeTimeout }} + scheme: http + relabelings: + # Add namespace label + - sourceLabels: [__meta_kubernetes_namespace] + targetLabel: namespace + # Add pod label + - sourceLabels: [__meta_kubernetes_pod_name] + targetLabel: pod + # Add service label + - sourceLabels: [__meta_kubernetes_service_name] + targetLabel: service +{{- end }} diff --git a/charts/nextcloud-mcp-server/values.yaml b/charts/nextcloud-mcp-server/values.yaml index 06a96df..e6fdcf7 100644 --- a/charts/nextcloud-mcp-server/values.yaml +++ b/charts/nextcloud-mcp-server/values.yaml @@ -168,6 +168,43 @@ securityContext: runAsNonRoot: true runAsUser: 1000 +# Observability Configuration +observability: + # Prometheus metrics + metrics: + enabled: true + port: 9090 + path: /metrics + + # OpenTelemetry tracing + tracing: + enabled: false + endpoint: "" # e.g., "http://opentelemetry-collector:4317" + serviceName: "nextcloud-mcp-server" + samplingRate: 1.0 + + # Logging configuration + logging: + format: json # "json" or "text" + level: INFO + 
includeTraceContext: true + +# Prometheus ServiceMonitor (requires Prometheus Operator) +serviceMonitor: + enabled: false + interval: 30s + scrapeTimeout: 10s + labels: {} + # Additional labels for ServiceMonitor (e.g., for Prometheus selector) + # Example: { prometheus: kube-prometheus } + +# Prometheus alert rules (requires Prometheus Operator) +prometheusRule: + enabled: false + labels: {} + # Additional labels for PrometheusRule (e.g., for Prometheus selector) + # Example: { prometheus: kube-prometheus } + service: type: ClusterIP port: 8000 From 5e4667a643522774c60f6feca4c41edec8df1b4f Mon Sep 17 00:00:00 2001 From: Chris Coutinho Date: Sun, 9 Nov 2025 09:28:09 +0100 Subject: [PATCH 2/2] fix(readiness): Only check external Qdrant in network mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The readiness probe incorrectly tried to connect to an external Qdrant service even when using memory or persistent mode (embedded Qdrant). This caused pods to never become ready in Kubernetes deployments using the default configuration. Root cause: - In memory/persistent modes, QDRANT_URL env var is NOT set - Readiness check used default 'http://qdrant:6333' anyway - Tried to connect to non-existent service - Connection failed -> 503 -> pod stuck in not-ready state Fix: - Only check external Qdrant health if QDRANT_URL is explicitly set (network mode) - For embedded modes (memory/persistent), report status as 'embedded' without blocking - Background scanner tasks don't block readiness (already non-blocking via anyio.start_soon) This allows pods to become ready immediately when using embedded Qdrant, while still validating external Qdrant connectivity in network mode. 
Fixes: Kubernetes pods failing readiness check with default Qdrant configuration 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- nextcloud_mcp_server/app.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/nextcloud_mcp_server/app.py b/nextcloud_mcp_server/app.py index ecb4f79..113dff1 100644 --- a/nextcloud_mcp_server/app.py +++ b/nextcloud_mcp_server/app.py @@ -1172,13 +1172,15 @@ def get_app(transport: str = "sse", enabled_apps: list[str] | None = None): checks["auth_configured"] = "error: credentials not set" is_ready = False - # Check Qdrant status if vector sync is enabled + # Check Qdrant status if using network mode (external Qdrant service) + # In-memory and persistent modes use embedded Qdrant, no external service to check vector_sync_enabled = ( os.getenv("VECTOR_SYNC_ENABLED", "false").lower() == "true" ) - if vector_sync_enabled: + qdrant_url = os.getenv("QDRANT_URL") # Only set in network mode + + if vector_sync_enabled and qdrant_url: try: - qdrant_url = os.getenv("QDRANT_URL", "http://qdrant:6333") async with httpx.AsyncClient(timeout=2.0) as client: response = await client.get(f"{qdrant_url}/readyz") if response.status_code == 200: @@ -1189,6 +1191,9 @@ def get_app(transport: str = "sse", enabled_apps: list[str] | None = None): except Exception as e: checks["qdrant"] = f"error: {str(e)}" is_ready = False + elif vector_sync_enabled: + # Using embedded Qdrant (memory or persistent mode) + checks["qdrant"] = "embedded" status_code = 200 if is_ready else 503 return JSONResponse(