Update contrib/mixin alerting thresholds

tjungblu · tjungblu · commit 3d537d1fa695 · 2025-11-11T15:20:37.000+01:00
With the help of @dgoodwin we were able to identify better thresholds based on our fleet telemetry. Along with it, I wanted to contribute some minor improvements we made to our alerts over the years. Here's a summary by Claude: Alert Severity Changes - etcdMembersDown: Increased severity from warning to critical (alerts/alerts.libsonnet:10) Improved Alert Descriptions - etcdInsufficientMembers: Enhanced description with detailed troubleshooting guidance about control plane nodes, network connectivity, and the impact on Kubernetes APIs (alerts/alerts.libsonnet:20-21) Alert Query Improvements - etcdHighNumberOfLeaderChanges: Rewrote query to use changes(etcd_server_is_leader) instead of increase(etcd_server_leader_changes_seen_total), changed time window from 15m to 10m (alerts/alerts.libsonnet:30) More Aggressive Disk Performance Thresholds - etcdHighFsyncDurations (warning): Lowered threshold from 0.5s to 0.05s (alerts/alerts.libsonnet:47) - etcdHighFsyncDurations (critical): Lowered threshold from 1s to 0.07s (alerts/alerts.libsonnet:56) - etcdHighCommitDurations (warning): Lowered threshold from 0.25s to 0.08s (alerts/alerts.libsonnet:65) - etcdHighCommitDurations (critical): Added new critical alert at 0.1s threshold (alerts/alerts.libsonnet:74-87) Database Quota Alerts - etcdDatabaseQuotaLowSpace: Added tiered alerts at 65% (info), 75% (warning), and lowered critical from 95% to 85% (alerts/alerts.libsonnet:89-121) Signed-off-by: Thomas Jungblut <tjungblu@redhat.com>
diff --git a/contrib/mixin/alerts/alerts.libsonnet b/contrib/mixin/alerts/alerts.libsonnet
@@ -18,7 +18,7 @@
             ||| % { etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds * 4 },
             'for': '20m',
             labels: {
-              severity: 'warning',
+              severity: 'critical',
             },
             annotations: {
               description: 'etcd cluster "{{ $labels.%s }}": members are down ({{ $value }}).' % $._config.clusterLabel,
@@ -35,8 +35,8 @@
               severity: 'critical',
             },
             annotations: {
-              description: 'etcd cluster "{{ $labels.%s }}": insufficient members ({{ $value }}).' % $._config.clusterLabel,
-              summary: 'etcd cluster has insufficient number of members.',
+              description: 'etcd cluster "{{ $labels.%s }}": is reporting fewer instances are available than are needed ({{ $value }}). When etcd does not have a majority of instances available the Kubernetes APIs will reject read and write requests and operations that preserve the health of workloads cannot be performed. This can occur when multiple control plane nodes are powered off or are unable to connect to each other via the network. Check that all control plane nodes are powered on and that network connections between each machine are functional.' % $._config.clusterLabel,
+              summary: 'etcd is reporting that a majority of instances are unavailable.',
             },
           },
           {
@@ -56,14 +56,14 @@
           {
             alert: 'etcdHighNumberOfLeaderChanges',
             expr: |||
-              increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4
+              avg by (job) (changes(etcd_server_is_leader{%(etcd_selector)s}[10m])) > 5
             ||| % $._config,
             'for': '5m',
             labels: {
               severity: 'warning',
             },
             annotations: {
-              description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' % $._config.clusterLabel,
+              description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} leader changes within the last 10 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' % $._config.clusterLabel,
               summary: 'etcd cluster has high number of leader changes.',
             },
           },
@@ -149,7 +149,7 @@
             alert: 'etcdHighFsyncDurations',
             expr: |||
               histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
-              > 0.5
+              > 0.05
             ||| % $._config,
             'for': '10m',
             labels: {
@@ -164,7 +164,7 @@
             alert: 'etcdHighFsyncDurations',
             expr: |||
               histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m]))
-              > 1
+              > 0.07
             ||| % $._config,
             'for': '10m',
             labels: {
@@ -179,7 +179,7 @@
             alert: 'etcdHighCommitDurations',
             expr: |||
               histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m]))
-              > 0.25
+              > 0.08
             ||| % $._config,
             'for': '10m',
             labels: {
@@ -190,10 +190,53 @@
               summary: 'etcd cluster 99th percentile commit durations are too high.',
             },
           },
+          {
+            alert: 'etcdHighCommitDurations',
+            expr: |||
+              histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m]))
+              > 0.1
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              description: 'etcd cluster "{{ $labels.%s }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel,
+              summary: 'etcd cluster 99th percentile commit durations are too high.',
+            },
+          },
+          {
+            alert: 'etcdDatabaseQuotaLowSpace',
+            expr: |||
+              (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 65
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'info',
+            },
+            annotations: {
+              description: 'etcd cluster "{{ $labels.%s }}": database size is 65 percent of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel,
+              summary: 'etcd cluster database is using >= 65 percent of the defined quota.',
+            },
+          },
+          {
+            alert: 'etcdDatabaseQuotaLowSpace',
+            expr: |||
+              (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 75
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              description: 'etcd cluster "{{ $labels.%s }}": database size is 75 percent of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel,
+              summary: 'etcd cluster database is using >= 75 percent of the defined quota.',
+            },
+          },
           {
             alert: 'etcdDatabaseQuotaLowSpace',
             expr: |||
-              (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 95
+              (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 85
             ||| % $._config,
             'for': '10m',
             labels: {
diff --git a/contrib/mixin/test.yaml b/contrib/mixin/test.yaml
@@ -22,7 +22,7 @@ tests:
         exp_alerts:
           - exp_labels:
               job: etcd
-              severity: warning
+              severity: critical
             exp_annotations:
               description: 'etcd cluster "etcd": members are down (3).'
               summary: etcd cluster members are down.
@@ -35,17 +35,17 @@ tests:
               job: etcd
               severity: critical
             exp_annotations:
-              description: 'etcd cluster "etcd": insufficient members (1).'
-              summary: etcd cluster has insufficient number of members.
+              description: "etcd cluster \"etcd\": is reporting fewer instances are available than are needed (1). When etcd does not have a majority of instances available the Kubernetes APIs will reject read and write requests and operations that preserve the health of workloads cannot be performed. This can occur when multiple control plane nodes are powered off or are unable to connect to each other via the network. Check that all control plane nodes are powered on and that network connections between each machine are functional."
+              summary: "etcd is reporting that a majority of instances are unavailable."
       - eval_time: 15m
         alertname: etcdInsufficientMembers
         exp_alerts:
           - exp_labels:
               job: etcd
               severity: critical
             exp_annotations:
-              description: 'etcd cluster "etcd": insufficient members (0).'
-              summary: etcd cluster has insufficient number of members.
+              description: "etcd cluster \"etcd\": is reporting fewer instances are available than are needed (0). When etcd does not have a majority of instances available the Kubernetes APIs will reject read and write requests and operations that preserve the health of workloads cannot be performed. This can occur when multiple control plane nodes are powered off or are unable to connect to each other via the network. Check that all control plane nodes are powered on and that network connections between each machine are functional."
+              summary: "etcd is reporting that a majority of instances are unavailable."
   - interval: 1m
     input_series:
       - series: up{job="etcd",instance="10.10.10.0"}
@@ -60,7 +60,7 @@ tests:
         exp_alerts:
           - exp_labels:
               job: etcd
-              severity: warning
+              severity: critical
             exp_annotations:
               description: 'etcd cluster "etcd": members are down (3).'
               summary: etcd cluster members are down.
@@ -78,40 +78,26 @@ tests:
         exp_alerts:
           - exp_labels:
               job: etcd
-              severity: warning
+              severity: critical
             exp_annotations:
               description: 'etcd cluster "etcd": members are down (1).'
               summary: etcd cluster members are down.
   - interval: 1m
     input_series:
-      - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}
-        values: 0 0 2 0 0 1 0 0 0 0 0 0 0 0 0 0
-      - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}
-        values: 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
-      - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}
-        values: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+      - series: etcd_server_is_leader{job="etcd"}
+        values: 0 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0 1 0
     alert_rule_test:
-      - eval_time: 10m
+      - eval_time: 5m
+        alertname: etcdHighNumberOfLeaderChanges
+      - eval_time: 15m
         alertname: etcdHighNumberOfLeaderChanges
         exp_alerts:
           - exp_labels:
               job: etcd
               severity: warning
             exp_annotations:
-              description: 'etcd cluster "etcd": 4 leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
+              description: 'etcd cluster "etcd": 9 leader changes within the last 10 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
               summary: etcd cluster has high number of leader changes.
-  - interval: 1m
-    input_series:
-      - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.0"}
-        values: 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
-      - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.1"}
-        values: 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
-      - series: etcd_server_leader_changes_seen_total{job="etcd",instance="10.10.10.2"}
-        values: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-    alert_rule_test:
-      - eval_time: 10m
-        alertname: etcdHighNumberOfLeaderChanges
-        exp_alerts:
   - interval: 1m
     input_series:
       - series: etcd_mvcc_db_total_size_in_bytes{job="etcd",instance="10.10.10.0"}