diff --git a/deploy/kubernetes/controller/service.yaml b/deploy/kubernetes/controller/service.yaml index 7fa410b1..8089814b 100644 --- a/deploy/kubernetes/controller/service.yaml +++ b/deploy/kubernetes/controller/service.yaml @@ -4,7 +4,7 @@ metadata: name: hcloud-csi-controller-metrics namespace: kube-system labels: - app: hcloud-csi + app: hcloud-csi-controller spec: selector: app: hcloud-csi-controller diff --git a/deploy/kubernetes/service-monitor/kustomization.yaml b/deploy/kubernetes/service-monitor/kustomization.yaml new file mode 100644 index 00000000..c82ed36f --- /dev/null +++ b/deploy/kubernetes/service-monitor/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: +- service-monitor.yaml diff --git a/deploy/kubernetes/service-monitor/service-monitor.yaml b/deploy/kubernetes/service-monitor/service-monitor.yaml new file mode 100644 index 00000000..b2383404 --- /dev/null +++ b/deploy/kubernetes/service-monitor/service-monitor.yaml @@ -0,0 +1,33 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: hcloud-csi-controller + namespace: kube-system + labels: + release: YOUR_RELEASE +spec: + endpoints: + - port: metrics + scheme: http + jobLabel: app + selector: + matchLabels: + app: hcloud-csi-controller + +--- + +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: hcloud-csi-node + namespace: kube-system + labels: + release: YOUR_RELEASE +spec: + endpoints: + - port: metrics + scheme: http + jobLabel: app + selector: + matchLabels: + app: hcloud-csi diff --git a/deploy/monitoring/grafana-dashboard.json b/deploy/monitoring/grafana-dashboard.json new file mode 100644 index 00000000..9a262a07 --- /dev/null +++ b/deploy/monitoring/grafana-dashboard.json @@ -0,0 +1,1770 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "9.3.6" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "stat", + "name": "Stat", + "version": "" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "description": "", + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": null, + "graphTooltip": 1, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 22, + "panels": [], + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "Global", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "#d44a3a", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 33 + }, + { + "color": "#299c46", + "value": 66 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 0, + "y": 1 + }, + "id": 15, + "links": [], + "maxDataPoints": 100, + "options": { + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(up{pod=~\"$pod\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{pod}}", + "range": true, + "refId": "Available" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "count(up{pod=~\"$pod\"})", + "hide": false, + "legendFormat": "total", + "range": true, + "refId": "Total" + } + ], + "title": "Available Pods", + "transformations": [ + { + "id": "configFromData", + "options": { + "applyTo": { + "id": "byType", + "options": "number" + }, + "configRefId": "Total", + "mappings": [ + { + "fieldName": "total", + "handlerKey": "max" + } + ] + } + } + ], + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 95 + }, + { + "color": "#d44a3a", + "value": 99 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 1 + }, + "id": 12, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "background", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(sum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_type=\"unary\",grpc_code=\"OK\"}[$__rate_interval])) + \nsum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_type=\"unary\",grpc_code=\"NotFound\"}[$__rate_interval])) + \nsum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_type=\"unary\",grpc_code=\"PermissionDenied\"}[$__rate_interval])) + \nsum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_type=\"unary\",grpc_code=\"InvalidArgument\"}[$__rate_interval])))\n/ \nsum(rate(grpc_server_started_total{pod=~\"$pod\",grpc_type=\"unary\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "range": true, + "refId": "A" + } + ], + "title": "Global Success Rate", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Without csi.v1.Identity.Probe (health checks)", + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "rgb(31, 120, 193)", + "mode": "fixed" + }, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 16, + "y": 1 + }, + "id": 14, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "none", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "textMode": "auto" + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(irate(grpc_server_started_total{pod=~\"$pod\",grpc_method!=\"Probe\"}[$__rate_interval]))", + "format": "time_series", + "intervalFactor": 1, + "range": true, + "refId": "A" + } + ], + "title": "CSI QPS", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 0, + "y": 6 + }, + "id": 28, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(container_memory_working_set_bytes{pod=~\"$pod\",container!=\"\"}) by (pod, container)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container}} ({{pod}})", + "metric": "process_resident_memory_bytes", + "range": true, + "refId": "A", + "step": 4 + } + ], + "title": "Pod Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 12, + "x": 12, + "y": 6 + }, + "id": 34, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by(container,pod)(rate(container_cpu_usage_seconds_total{pod=~\"$pod\",container!=\"\"}[$__interval]))", + "intervalFactor": 2, + "legendFormat": "{{container}} ({{pod}})", + "metric": "go_memstats_alloc_bytes", + "range": true, + "refId": "A", + "step": 4 + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 12 + }, + "id": 56, + "panels": [], + "title": "CSI gRPC", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": true, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 13 + }, + "id": 2, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(irate(grpc_server_started_total{pod=~\"$pod\"}[$__rate_interval])) by (grpc_service, pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{grpc_service}} ({{pod}})", + "range": true, + "refId": "A" + } + ], + "title": "QPS", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "ms" + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 19 + }, + "id": 16, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(rate(grpc_server_handling_seconds_sum{pod=~\"$pod\",grpc_type=\"unary\"}[$__rate_interval])) by (grpc_service, pod) * 1000", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{grpc_service}} ({{pod}})", + "range": true, + "refId": "A" + } + ], + "title": "Average handling time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "short" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 26, + "links": [], + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_type=\"unary\"}[$__rate_interval])) by (grpc_code, pod)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{grpc_code}} ({{pod}})", + "range": true, + "refId": "A" + } + ], + "title": "gRPC request code", + "type": "timeseries" + }, + { + "collapsed": false, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 20, + "panels": [], + "repeat": "grpc_service", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "refId": "A" + } + ], + "title": "$grpc_service gRPC requests", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 155, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_service=~\"$grpc_service\"}[$__rate_interval])) by (grpc_method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{grpc_method}}", + "range": true, + "refId": "A" + } + ], + "title": "Request rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 8, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.99, \n sum(rate(grpc_server_handling_seconds_bucket{pod=~\"$pod\",grpc_service=~\"$grpc_service\"}[$__rate_interval])) by (grpc_method,le)\n)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{grpc_method}}", + "range": true, + "refId": "A" + } + ], + "title": "99%-tile latency of requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 44 + }, + "id": 4, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_code!=\"OK\",grpc_service=~\"$grpc_service\"}[$__rate_interval])) by (grpc_method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{grpc_method}}", + "range": true, + "refId": "A" + } + ], + "title": "Request error rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 16, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 44 + }, + "id": 6, + "links": [], + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(grpc_server_handled_total{pod=~\"$pod\",grpc_code!=\"OK\",grpc_service=~\"$grpc_service\"}[$__rate_interval])) by (grpc_method)\n/ \nsum(rate(grpc_server_started_total{pod=~\"$pod\",grpc_service=~\"$grpc_service\"}[$__rate_interval])) by (grpc_method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{grpc_method}}", + "range": true, + "refId": "A" + } + ], + "title": "Request error percentage", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 86 + }, + "id": 68, + "panels": [], + "title": "Hetzner Cloud API", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Total number of requests made per endpoint.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 87 + }, + "id": 124, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum by (method, api_endpoint) (rate(hcloud_api_requests_total{pod=~\"$pod\"}[$__rate_interval]))", + "legendFormat": "{{method}} {{api_endpoint}}", + "range": true, + "refId": "A" + } + ], + "title": "Requests per Endpoint", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Responses from Hetzner Cloud API by HTTP code", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic", + "seriesBy": "last" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 87 + }, + "id": 102, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(hcloud_api_requests_total{pod=~\"$pod\"}[$__rate_interval])) by (code)", + "legendFormat": "{{code}}", + "range": true, + "refId": "A" + } + ], + "title": "HTTP Response Code", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "Requests made over 1 hour window. Relevant for Rate Limiting Issues", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 95 + }, + "id": 90, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "sum(delta(hcloud_api_requests_total{pod=~\"$pod\"}[1h]))", + "instant": false, + "legendFormat": "Requests over 1hr window", + "range": true, + "refId": "A" + } + ], + "title": "1h Total Request", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "scaleDistribution": { + "type": "linear" + } + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 95 + }, + "id": 146, + "options": { + "calculate": false, + "cellGap": 1, + "cellValues": { + "unit": "reqps" + }, + "color": { + "exponent": 0.5, + "fill": "dark-orange", + "min": 0, + "mode": "scheme", + "reverse": false, + "scale": "exponential", + "scheme": "Oranges", + "steps": 32 + }, + "exemplars": { + "color": "rgba(255,0,255,0.7)" + }, + "filterValues": { + "le": 1e-9 + }, + "legend": { + "show": true + }, + "rowsFrame": { + "layout": "auto" + }, + "tooltip": { + "show": true, + "yHistogram": false + }, + "yAxis": { + "axisPlacement": "left", + "reverse": false, + "unit": "s" + } + }, + "pluginVersion": "9.3.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "sum(rate(hcloud_api_request_duration_seconds_bucket[$__rate_interval])) by (le)", + "format": "heatmap", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Response Time", + "type": "heatmap" + } + ], + "refresh": "10s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "hcloud", + "csi-driver", + "kubernetes", + "grpc" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "Data source", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(up{job=~\"hcloud-csi|hcloud-csi-controller\"}, pod)", + "hide": 0, + "includeAll": true, + "label": "Pod", + "multi": true, + "name": "pod", + "options": [], + "query": { + "query": "label_values(up{job=~\"hcloud-csi|hcloud-csi-controller\"}, pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "definition": "label_values(grpc_server_handled_total{job=~\"hcloud-csi|hcloud-csi-controller\"}, grpc_service)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "grpc_service", + "options": [], + "query": { + "query": "label_values(grpc_server_handled_total{job=~\"hcloud-csi|hcloud-csi-controller\"}, grpc_service)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "hcloud-csi-driver", + "uid": "Pa8xLJidC", + "version": 11, + "weekStart": "" +} diff --git a/docs/monitoring.md b/docs/monitoring.md new file mode 100644 index 00000000..0f0c38de --- /dev/null +++ b/docs/monitoring.md @@ -0,0 +1,43 @@ +# Monitoring + +Monitoring is an important part of managing any system, and the csi-driver components are no exception. +To help you keep an eye on how the components are performing, we've exposed Prometheus-compatible metrics on port 9189. +You can configure the endpoint for these metrics by setting the `METRICS_ENDPOINT` environment variable to the appropriate value for your system. + +The metrics exposed include the following: + +- Go Runtime +- gRPC Server for CSI calls +- HTTP calls made to Hetzner Cloud API + +## Scraping on Kubernetes + +There are two ways to scrape metrics on Kubernetes: + +### Using `kubernetes_sd_configs` + +If you're using `kubernetes_sd_configs`, you can configure scraping by adding some annotations to the csi-driver workloads. +Specifically, you'll need to add these annotations to the Deployment `hcloud-csi-controller` and the DaemonSet `hcloud-csi-node`: + +```yaml +annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9189" +``` + +With these annotations in place, Prometheus should be able to scrape metrics from the csi-driver components. + +### prometheus-operator `ServiceMonitor` + +If you're using the prometheus-operator, you can use our `ServiceMonitors` to set up scraping. +You can find these ServiceMonitors in [`deploy/kubernetes/service-monitor`](../deploy/kubernetes/service-monitor/). + +To use these `ServiceMonitors`, you'll need to replace `ServiceMonitor.metadata.labels.release: YOUR_RELEASE` with the value that you've configured in your `Prometheus` resource. +This will ensure that the `ServiceMonitors` actually scrape the appropriate targets. + +## Grafana Dashboard + +In addition to scraping metrics, you'll also want a way to visualize those metrics. +To help with this, we provide a default Grafana dashboard that can be used to display the most important metrics. +This dashboard has been confirmed to work with kube-prometheus-stack, but it may require some tweaking to work correctly in your specific environment. +You can find the dashboard at [`deploy/monitoring/grafana-dashboard.json`](../deploy/monitoring/grafana-dashboard.json).