|
18 | 18 | ||| % { etcd_instance_labels: $._config.etcd_instance_labels, etcd_selector: $._config.etcd_selector, network_failure_range: $._config.scrape_interval_seconds * 4 }, |
19 | 19 | 'for': '20m', |
20 | 20 | labels: { |
21 | | - severity: 'warning', |
| 21 | + severity: 'critical', |
22 | 22 | }, |
23 | 23 | annotations: { |
24 | 24 | description: 'etcd cluster "{{ $labels.%s }}": members are down ({{ $value }}).' % $._config.clusterLabel, |
|
35 | 35 | severity: 'critical', |
36 | 36 | }, |
37 | 37 | annotations: { |
38 | | - description: 'etcd cluster "{{ $labels.%s }}": insufficient members ({{ $value }}).' % $._config.clusterLabel, |
39 | | - summary: 'etcd cluster has insufficient number of members.', |
| 38 | + description: 'etcd cluster "{{ $labels.%s }}": is reporting fewer instances are available than are needed ({{ $value }}). When etcd does not have a majority of instances available the Kubernetes APIs will reject read and write requests and operations that preserve the health of workloads cannot be performed. This can occur when multiple control plane nodes are powered off or are unable to connect to each other via the network. Check that all control plane nodes are powered on and that network connections between each machine are functional.' % $._config.clusterLabel, |
| 39 | + summary: 'etcd is reporting that a majority of instances are unavailable.', |
40 | 40 | }, |
41 | 41 | }, |
42 | 42 | { |
|
56 | 56 | { |
57 | 57 | alert: 'etcdHighNumberOfLeaderChanges', // Warns when etcd_server_is_leader flips often, averaged across the members of one cluster. |
58 | 58 | expr: ||| |
59 | | - increase((max without (%(etcd_instance_labels)s) (etcd_server_leader_changes_seen_total{%(etcd_selector)s}) or 0*absent(etcd_server_leader_changes_seen_total{%(etcd_selector)s}))[15m:1m]) >= 4 |
| 59 | + avg by (%(clusterLabel)s) (changes(etcd_server_is_leader{%(etcd_selector)s}[10m])) > 5 |
60 | 60 | ||| % $._config, // Supplies %(clusterLabel)s and %(etcd_selector)s; grouping by the cluster label keeps it available to the description template. |
61 | 61 | 'for': '5m', |
62 | 62 | labels: { |
63 | 63 | severity: 'warning', |
64 | 64 | }, |
65 | 65 | annotations: { |
66 | | - description: 'etcd cluster "{{ $labels.%s }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' % $._config.clusterLabel, |
| 66 | + description: 'etcd cluster "{{ $labels.%s }}": an average of {{ $value }} leader-state changes per member within the last 10 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' % $._config.clusterLabel, |
67 | 67 | summary: 'etcd cluster has high number of leader changes.', |
68 | 68 | }, |
69 | 69 | }, |
|
149 | 149 | alert: 'etcdHighFsyncDurations', |
150 | 150 | expr: ||| |
151 | 151 | histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) |
152 | | - > 0.5 |
| 152 | + > 0.05 |
153 | 153 | ||| % $._config, |
154 | 154 | 'for': '10m', |
155 | 155 | labels: { |
|
164 | 164 | alert: 'etcdHighFsyncDurations', |
165 | 165 | expr: ||| |
166 | 166 | histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{%(etcd_selector)s}[5m])) |
167 | | - > 1 |
| 167 | + > 0.07 |
168 | 168 | ||| % $._config, |
169 | 169 | 'for': '10m', |
170 | 170 | labels: { |
|
179 | 179 | alert: 'etcdHighCommitDurations', |
180 | 180 | expr: ||| |
181 | 181 | histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m])) |
182 | | - > 0.25 |
| 182 | + > 0.08 |
183 | 183 | ||| % $._config, |
184 | 184 | 'for': '10m', |
185 | 185 | labels: { |
|
190 | 190 | summary: 'etcd cluster 99th percentile commit durations are too high.', |
191 | 191 | }, |
192 | 192 | }, |
| 193 | + { |
| 194 | + alert: 'etcdHighCommitDurations', // Critical-severity rule on the 99th percentile of etcd_disk_backend_commit_duration_seconds. |
| 195 | + expr: ||| |
| 196 | + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{%(etcd_selector)s}[5m])) |
| 197 | + > 0.1 |
| 198 | + ||| % $._config, // Fills %(etcd_selector)s in the query from $._config. |
| 199 | + 'for': '10m', // The condition must hold for 10m before the alert fires. |
| 200 | + labels: { |
| 201 | + severity: 'critical', |
| 202 | + }, |
| 203 | + annotations: { |
| 204 | + description: 'etcd cluster "{{ $labels.%s }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' % $._config.clusterLabel, // %s is replaced with the configured cluster label name. |
| 205 | + summary: 'etcd cluster 99th percentile commit durations are too high.', |
| 206 | + }, |
| 207 | + }, |
| 208 | + { |
| 209 | + alert: 'etcdDatabaseQuotaLowSpace', // Info tier: database size as a percentage of the backend quota stays above 65 for 10m. |
| 210 | + expr: ||| |
| 211 | + (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 65 |
| 212 | + ||| % $._config, |
| 213 | + 'for': '10m', |
| 214 | + labels: { |
| 215 | + severity: 'info', |
| 216 | + }, |
| 217 | + annotations: { |
| 218 | + description: 'etcd cluster "{{ $labels.%s }}": database size exceeds 65 percent of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel, // Wording matches the strict "> 65" comparison in expr. |
| 219 | + summary: 'etcd cluster database is using more than 65 percent of the defined quota.', |
| 220 | + }, |
| 221 | + }, |
| 222 | + { |
| 223 | + alert: 'etcdDatabaseQuotaLowSpace', // Warning tier: database size as a percentage of the backend quota stays above 75 for 10m. |
| 224 | + expr: ||| |
| 225 | + (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 75 |
| 226 | + ||| % $._config, |
| 227 | + 'for': '10m', |
| 228 | + labels: { |
| 229 | + severity: 'warning', |
| 230 | + }, |
| 231 | + annotations: { |
| 232 | + description: 'etcd cluster "{{ $labels.%s }}": database size exceeds 75 percent of the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' % $._config.clusterLabel, // Wording matches the strict "> 75" comparison in expr. |
| 233 | + summary: 'etcd cluster database is using more than 75 percent of the defined quota.', |
| 234 | + }, |
| 235 | + }, |
193 | 236 | { |
194 | 237 | alert: 'etcdDatabaseQuotaLowSpace', |
195 | 238 | expr: ||| |
196 | | - (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 95 |
| 239 | + (last_over_time(etcd_mvcc_db_total_size_in_bytes{%(etcd_selector)s}[5m]) / last_over_time(etcd_server_quota_backend_bytes{%(etcd_selector)s}[5m]))*100 > 85 |
197 | 240 | ||| % $._config, |
198 | 241 | 'for': '10m', |
199 | 242 | labels: { |
|
0 commit comments