093ac5b5ba
Add comprehensive observability configuration to Helm chart: **Helm Values:** - Add observability configuration section for metrics, tracing, and logging - Add serviceMonitor configuration (disabled by default) - Add prometheusRule configuration (disabled by default) **Templates:** - Update deployment to include observability environment variables - Update deployment to expose metrics port (9090) - Update service to expose metrics port - Add ServiceMonitor template for Prometheus Operator - Add PrometheusRule template with critical and warning alerts **Dashboards:** - Add comprehensive Grafana dashboard JSON with 6 panels: - Request Rate (by method and endpoint) - Error Rate (5xx errors percentage) - Request Latency (P50/P95 by endpoint) - Top MCP Tools (by invocation volume) - Nextcloud API Latency (by app) - Vector Sync Queue Size - Add dashboard README with import instructions **Alert Rules:** - Critical: Server down, high error rate (>5%), high latency (>1s), dependency down - Warning: Token validation errors (>1%), vector sync queue high (>100), Qdrant slow (>500ms) All features are opt-in via values.yaml configuration. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
93 lines
3.9 KiB
YAML
93 lines
3.9 KiB
YAML
{{- /*
PrometheusRule (Prometheus Operator CRD) carrying the alert rules for the
Nextcloud MCP server. It is rendered only when both
observability.metrics.enabled and prometheusRule.enabled are set in values.

Alert annotations are expanded by Prometheus at alert time, not by Helm, so
their template delimiters are emitted through backtick raw strings to keep
Helm from evaluating them. Only functions that Prometheus itself provides may
be used inside those annotation templates.

Every expression selects series whose job label equals the chart fullname.
NOTE(review): confirm the scrape configuration / ServiceMonitor actually
produces that job label.
*/ -}}
{{- if and .Values.observability.metrics.enabled .Values.prometheusRule.enabled }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: {{ include "nextcloud-mcp-server.fullname" . }}
  namespace: {{ .Release.Namespace }}
  labels:
    {{- include "nextcloud-mcp-server.labels" . | nindent 4 }}
    {{- /* Optional extra labels taken from prometheusRule.labels. */}}
    {{- with .Values.prometheusRule.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
spec:
  groups:
    # Critical alerts, evaluated every 30s.
    - name: nextcloud-mcp-server.critical
      interval: 30s
      rules:
        # The scrape target has reported up == 0 for 5 minutes.
        - alert: NextcloudMCPServerDown
          expr: up{job="{{ include "nextcloud-mcp-server.fullname" . }}"} == 0
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "Nextcloud MCP Server is down"
            description: "{{ `{{` }} $labels.pod {{ `}}` }} has been down for more than 5 minutes."

        # Share of requests with a 5xx status code over 5m exceeds 5%.
        - alert: NextcloudMCPHighErrorRate
          expr: |
            sum(rate(mcp_http_requests_total{status_code=~"5..", job="{{ include "nextcloud-mcp-server.fullname" . }}"}[5m]))
            / sum(rate(mcp_http_requests_total{job="{{ include "nextcloud-mcp-server.fullname" . }}"}[5m])) > 0.05
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High error rate on Nextcloud MCP Server"
            # Prometheus alert templates have no arithmetic helper such as
            # `mul`; humanizePercentage formats the 0..1 ratio as a percentage.
            description: "Error rate is {{ `{{` }} $value | humanizePercentage {{ `}}` }} (threshold: 5%)"

        # P95 request duration, per endpoint, stays above 1 second.
        - alert: NextcloudMCPHighLatency
          expr: |
            histogram_quantile(0.95,
              sum(rate(mcp_http_request_duration_seconds_bucket{job="{{ include "nextcloud-mcp-server.fullname" . }}"}[5m])) by (le, endpoint)
            ) > 1
          for: 5m
          labels:
            severity: critical
          annotations:
            summary: "High latency on Nextcloud MCP Server"
            description: "P95 latency is {{ `{{` }} printf \"%.2fs\" $value {{ `}}` }} on {{ `{{` }} $labels.endpoint {{ `}}` }} (threshold: 1s)"

        # A dependency health gauge has been 0 for 2 minutes.
        - alert: NextcloudMCPDependencyDown
          expr: mcp_dependency_health{job="{{ include "nextcloud-mcp-server.fullname" . }}"} == 0
          for: 2m
          labels:
            severity: critical
          annotations:
            summary: "Nextcloud MCP dependency is down"
            description: "Dependency {{ `{{` }} $labels.dependency {{ `}}` }} has been down for more than 2 minutes."

    # Warning alerts, evaluated every 30s.
    - name: nextcloud-mcp-server.warning
      interval: 30s
      rules:
        # Share of token validations with result="error" over 10m exceeds 1%.
        - alert: NextcloudMCPTokenValidationErrors
          expr: |
            sum(rate(mcp_oauth_token_validations_total{result="error", job="{{ include "nextcloud-mcp-server.fullname" . }}"}[10m]))
            / sum(rate(mcp_oauth_token_validations_total{job="{{ include "nextcloud-mcp-server.fullname" . }}"}[10m])) > 0.01
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "High token validation error rate"
            # See NextcloudMCPHighErrorRate: the ratio is formatted with
            # humanizePercentage because Prometheus templates cannot multiply.
            description: "Token validation error rate is {{ `{{` }} $value | humanizePercentage {{ `}}` }} (threshold: 1%)"

        # Vector sync queue size stays above 100 for 15 minutes.
        - alert: NextcloudMCPVectorSyncQueueHigh
          expr: mcp_vector_sync_queue_size{job="{{ include "nextcloud-mcp-server.fullname" . }}"} > 100
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: "Vector sync queue is high"
            description: "Vector sync queue size is {{ `{{` }} $value {{ `}}` }} (threshold: 100)"

        # P95 duration of operations labelled db="qdrant" stays above 0.5s.
        - alert: NextcloudMCPQdrantSlowQueries
          expr: |
            histogram_quantile(0.95,
              sum(rate(mcp_db_operation_duration_seconds_bucket{db="qdrant", job="{{ include "nextcloud-mcp-server.fullname" . }}"}[10m])) by (le)
            ) > 0.5
          for: 10m
          labels:
            severity: warning
          annotations:
            summary: "Qdrant queries are slow"
            description: "P95 Qdrant query latency is {{ `{{` }} printf \"%.2fs\" $value {{ `}}` }} (threshold: 0.5s)"
{{- end }}
|