Skip to content

Commit d12ead4

Browse files
apollo_dashboard: revamp pod panels (#10229)
1 parent 6acb924 commit d12ead4

File tree

2 files changed

+264
-33
lines changed

2 files changed

+264
-33
lines changed

crates/apollo_dashboard/resources/dev_grafana.json

Lines changed: 69 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -3192,31 +3192,88 @@
31923192
"Pod Metrics": {
31933193
"panels": [
31943194
{
3195-
"title": "pod_memory_utilization",
3196-
"description": "Pod Memory Utilization",
3195+
"title": "Pod CPU Request Utilization",
3196+
"description": "Pod CPU utilization (usage / requests) (5m window)",
31973197
"type": "timeseries",
31983198
"exprs": [
3199-
"container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}"
3199+
"\n (\n sum by (namespace, pod) (\n rate(container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m])\n )\n )\n /\n (\n sum by (namespace, pod) (\n kube_pod_container_resource_requests_cpu_cores{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n "
32003200
],
3201-
"extra_params": {}
3201+
"extra_params": {
3202+
"unit": "percentunit",
3203+
"legends": [
3204+
"{{pod}}"
3205+
]
3206+
}
32023207
},
32033208
{
3204-
"title": "pod_disk_utilization",
3205-
"description": "Pod Disk Utilization",
3209+
"title": "Pod CPU throttling",
3210+
"description": "Pod CPU throttling (throttled / total periods) (5m window)",
32063211
"type": "timeseries",
32073212
"exprs": [
3208-
"kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}"
3213+
"(\n sum by (namespace, pod) (\n rate(container_cpu_cfs_throttled_periods_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m])\n )\n )\n /\n (\n sum by (namespace, pod) (\n rate(container_cpu_cfs_periods_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}[5m])\n )\n )\n "
32093214
],
3210-
"extra_params": {}
3215+
"extra_params": {
3216+
"unit": "percentunit",
3217+
"legends": [
3218+
"{{pod}}"
3219+
]
3220+
}
32113221
},
32123222
{
3213-
"title": "pod_cpu_utilization",
3214-
"description": "Pod CPU Utilization",
3223+
"title": "Pod Memory Request Utilization",
3224+
"description": "Pod memory utilization (used / requests)",
32153225
"type": "timeseries",
32163226
"exprs": [
3217-
"container_cpu_usage_seconds_total{cluster=~\"$cluster\", namespace=~\"$namespace\"}"
3227+
"\n (\n sum by (namespace, pod) (\n container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n /\n (\n sum by (namespace, pod) (\n kube_pod_container_resource_requests_memory_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n "
32183228
],
3219-
"extra_params": {}
3229+
"extra_params": {
3230+
"unit": "percentunit",
3231+
"legends": [
3232+
"{{pod}}"
3233+
]
3234+
}
3235+
},
3236+
{
3237+
"title": "Pod Memory Limit Utilization",
3238+
"description": "Pod memory limit utilization (used / limits)",
3239+
"type": "timeseries",
3240+
"exprs": [
3241+
"\n (\n sum by (namespace, pod) (\n container_memory_working_set_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n /\n (\n sum by (namespace, pod) (\n container_spec_memory_limit_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n "
3242+
],
3243+
"extra_params": {
3244+
"unit": "percentunit",
3245+
"legends": [
3246+
"{{pod}}"
3247+
]
3248+
}
3249+
},
3250+
{
3251+
"title": "Pod Disk Utilization",
3252+
"description": "Pod disk utilization (used / capacity)",
3253+
"type": "timeseries",
3254+
"exprs": [
3255+
"\n (\n sum by (namespace, pod) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n /\n (\n sum by (namespace, pod) (\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n "
3256+
],
3257+
"extra_params": {
3258+
"unit": "percentunit",
3259+
"legends": [
3260+
"{{pod}}"
3261+
]
3262+
}
3263+
},
3264+
{
3265+
"title": "Pod Disk Limit Utilization",
3266+
"description": "Pod disk limit utilization (used / capacity)",
3267+
"type": "timeseries",
3268+
"exprs": [
3269+
"\n (\n sum by (namespace, pod) (\n kubelet_volume_stats_used_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n /\n (\n sum by (namespace, pod) (\n kubelet_volume_stats_capacity_bytes{cluster=~\"$cluster\", namespace=~\"$namespace\"}\n )\n )\n "
3270+
],
3271+
"extra_params": {
3272+
"unit": "percentunit",
3273+
"legends": [
3274+
"{{pod}}"
3275+
]
3276+
}
32203277
}
32213278
],
32223279
"collapsed": true
Lines changed: 195 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -1,41 +1,215 @@
11
use apollo_metrics::metric_definitions::METRIC_LABEL_FILTER;
22

3-
use crate::dashboard::{Panel, PanelType, Row};
3+
use crate::dashboard::{Panel, PanelType, Row, Unit};
4+
use crate::infra_panels::POD_LEGEND;
45

5-
fn get_pod_memory_utilization_panel() -> Panel {
6+
// TODO(Tsabary): add thresholds.
7+
// TODO(Tsabary): replace query building with relevant functions and templates.
8+
9+
pub(crate) fn get_pod_metrics_row() -> Row {
10+
Row::new(
11+
"Pod Metrics",
12+
vec![
13+
get_pod_cpu_request_utilization_panel(),
14+
get_pod_cpu_throttling_panel(),
15+
get_pod_memory_request_utilization_panel(),
16+
get_pod_memory_limit_utilization_panel(),
17+
get_pod_disk_utilization_panel(),
18+
get_pod_disk_limit_utilization_panel(),
19+
],
20+
)
21+
}
22+
23+
const POD_METRICS_DEFAULT_DURATION: &str = "5m";
24+
25+
// ---------------------------- CPU ----------------------------
26+
27+
// Pod CPU utilization as a ratio of:
28+
// total CPU usage rate of containers in the pod (in cores)
29+
// --------------------------------------------------------
30+
// total CPU cores requested by containers in the pod
31+
// Aggregated per (namespace, pod), the result is a non-negative ratio per pod; it exceeds 1.0 when the pod uses more CPU than it requested.
32+
// Interpreted as: "How much of its requested CPU is this pod actually using?"
33+
fn get_pod_cpu_request_utilization_panel() -> Panel {
634
Panel::new(
7-
"pod_memory_utilization",
8-
"Pod Memory Utilization",
9-
format!("container_memory_working_set_bytes{METRIC_LABEL_FILTER}"),
35+
"Pod CPU Request Utilization",
36+
format!("Pod CPU utilization (usage / requests) ({POD_METRICS_DEFAULT_DURATION} window)"),
37+
format!(
38+
"
39+
(
40+
sum by (namespace, pod) (
41+
rate(container_cpu_usage_seconds_total{METRIC_LABEL_FILTER}[{POD_METRICS_DEFAULT_DURATION}])
42+
)
43+
)
44+
/
45+
(
46+
sum by (namespace, pod) (
47+
kube_pod_container_resource_requests_cpu_cores{METRIC_LABEL_FILTER}
48+
)
49+
)
50+
"
51+
),
1052
PanelType::TimeSeries,
1153
)
54+
.with_legends(POD_LEGEND)
55+
.with_unit(Unit::PercentUnit)
1256
}
1357

14-
fn get_pod_disk_utilization_panel() -> Panel {
58+
// Pod CPU throttling as a ratio of:
59+
// number of CFS CPU periods where containers in the pod were throttled
60+
// --------------------------------------------------------------------
61+
// total number of CFS CPU periods for containers in the pod
62+
// Aggregated per (namespace, pod), the result is a value between 0.0 and 1.0 per pod.
63+
// Interpreted as: "In what fraction of CFS periods is this pod being CPU-throttled by its CPU *limit*?"
64+
fn get_pod_cpu_throttling_panel() -> Panel {
1565
Panel::new(
16-
"pod_disk_utilization",
17-
"Pod Disk Utilization",
18-
format!("kubelet_volume_stats_used_bytes{METRIC_LABEL_FILTER}"),
66+
"Pod CPU throttling",
67+
format!("Pod CPU throttling (throttled / total periods) ({POD_METRICS_DEFAULT_DURATION} window)"),
68+
format!(
69+
"(
70+
sum by (namespace, pod) (
71+
rate(container_cpu_cfs_throttled_periods_total{METRIC_LABEL_FILTER}[{POD_METRICS_DEFAULT_DURATION}])
72+
)
73+
)
74+
/
75+
(
76+
sum by (namespace, pod) (
77+
rate(container_cpu_cfs_periods_total{METRIC_LABEL_FILTER}[{POD_METRICS_DEFAULT_DURATION}])
78+
)
79+
)
80+
"
81+
),
1982
PanelType::TimeSeries,
2083
)
84+
.with_legends(POD_LEGEND)
85+
.with_unit(Unit::PercentUnit)
2186
}
2287

23-
fn get_pod_cpu_utilization_panel() -> Panel {
88+
// ---------------------------- MEMORY ----------------------------
89+
90+
// Pod memory utilization as a ratio of:
91+
// total memory used by containers in the pod
92+
// ------------------------------------------------
93+
// total memory requested by containers in the pod
94+
// Aggregated per (namespace, pod), the result is a non-negative ratio per pod; it exceeds 1.0 when the pod uses more memory than it requested.
95+
// Interpreted as: "How much of its requested memory is this pod actually using?"
96+
fn get_pod_memory_request_utilization_panel() -> Panel {
2497
Panel::new(
25-
"pod_cpu_utilization",
26-
"Pod CPU Utilization",
27-
format!("container_cpu_usage_seconds_total{METRIC_LABEL_FILTER}"),
98+
"Pod Memory Request Utilization",
99+
"Pod memory utilization (used / requests)",
100+
format!(
101+
"
102+
(
103+
sum by (namespace, pod) (
104+
container_memory_working_set_bytes{METRIC_LABEL_FILTER}
105+
)
106+
)
107+
/
108+
(
109+
sum by (namespace, pod) (
110+
kube_pod_container_resource_requests_memory_bytes{METRIC_LABEL_FILTER}
111+
)
112+
)
113+
"
114+
),
28115
PanelType::TimeSeries,
29116
)
117+
.with_legends(POD_LEGEND)
118+
.with_unit(Unit::PercentUnit)
30119
}
31120

32-
pub(crate) fn get_pod_metrics_row() -> Row {
33-
Row::new(
34-
"Pod Metrics",
35-
vec![
36-
get_pod_memory_utilization_panel(),
37-
get_pod_disk_utilization_panel(),
38-
get_pod_cpu_utilization_panel(),
39-
],
121+
// Pod memory limit utilization as a ratio of:
122+
// total memory used by containers in the pod
123+
// ------------------------------------------
124+
// total memory limit of containers in the pod
125+
// Aggregated per (namespace, pod), the result is a value between 0.0 and 1.0 per pod.
126+
// Interpreted as: "How close is this pod to its memory *limit* (OOM-kill threshold)?"
127+
// Note: memory is not throttled like CPU; crossing this limit results in OOM kills.
128+
fn get_pod_memory_limit_utilization_panel() -> Panel {
129+
Panel::new(
130+
"Pod Memory Limit Utilization",
131+
"Pod memory limit utilization (used / limits)",
132+
format!(
133+
"
134+
(
135+
sum by (namespace, pod) (
136+
container_memory_working_set_bytes{METRIC_LABEL_FILTER}
137+
)
138+
)
139+
/
140+
(
141+
sum by (namespace, pod) (
142+
container_spec_memory_limit_bytes{METRIC_LABEL_FILTER}
143+
)
144+
)
145+
"
146+
),
147+
PanelType::TimeSeries,
148+
)
149+
.with_legends(POD_LEGEND)
150+
.with_unit(Unit::PercentUnit)
151+
}
152+
153+
// ---------------------------- DISK ----------------------------
154+
155+
// Pod disk utilization (PVC) as a ratio of:
156+
// total volume bytes used by the pod
157+
// ----------------------------------
158+
// total volume capacity bytes of the pod
159+
// Aggregated per (namespace, pod), the result is a value between 0.0 and 1.0 per pod.
160+
// Interpreted as: "How much of the provisioned PVC capacity is this pod using?"
161+
fn get_pod_disk_utilization_panel() -> Panel {
162+
Panel::new(
163+
"Pod Disk Utilization",
164+
"Pod disk utilization (used / capacity)",
165+
format!(
166+
"
167+
(
168+
sum by (namespace, pod) (
169+
kubelet_volume_stats_used_bytes{METRIC_LABEL_FILTER}
170+
)
171+
)
172+
/
173+
(
174+
sum by (namespace, pod) (
175+
kubelet_volume_stats_capacity_bytes{METRIC_LABEL_FILTER}
176+
)
177+
)
178+
"
179+
),
180+
PanelType::TimeSeries,
181+
)
182+
.with_legends(POD_LEGEND)
183+
.with_unit(Unit::PercentUnit)
184+
}
185+
186+
// Pod disk limit utilization (PVC) as a ratio of:
187+
// total volume bytes used by the pod
188+
// ----------------------------------
189+
// total volume capacity bytes of the pod (effective disk limit)
190+
// Aggregated per (namespace, pod), the result is a value between 0.0 and 1.0 per pod.
191+
// Interpreted as: "How close is this pod's PVC storage to being full (disk *limit* saturation)?" Note: the query below is currently identical to the one used by the "Pod Disk Utilization" panel.
192+
fn get_pod_disk_limit_utilization_panel() -> Panel {
193+
Panel::new(
194+
"Pod Disk Limit Utilization",
195+
"Pod disk limit utilization (used / capacity)",
196+
format!(
197+
"
198+
(
199+
sum by (namespace, pod) (
200+
kubelet_volume_stats_used_bytes{METRIC_LABEL_FILTER}
201+
)
202+
)
203+
/
204+
(
205+
sum by (namespace, pod) (
206+
kubelet_volume_stats_capacity_bytes{METRIC_LABEL_FILTER}
207+
)
208+
)
209+
"
210+
),
211+
PanelType::TimeSeries,
40212
)
213+
.with_legends(POD_LEGEND)
214+
.with_unit(Unit::PercentUnit)
41215
}

0 commit comments

Comments
 (0)