Skip to content

Commit 3ff3e35

Browse files
committed
Add metrics to UI backend
Signed-off-by: Xabier Larrakoetxea <[email protected]>
1 parent cd8eb93 commit 3ff3e35

File tree

9 files changed

+462
-10
lines changed

9 files changed

+462
-10
lines changed

cmd/sloth/commands/server.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,17 @@ import (
1313
"github.com/oklog/run"
1414
promapi "github.com/prometheus/client_golang/api"
1515
promv1 "github.com/prometheus/client_golang/api/prometheus/v1"
16+
"github.com/prometheus/client_golang/prometheus"
1617
"github.com/prometheus/client_golang/prometheus/promhttp"
18+
gohttpmetricsprometheus "github.com/slok/go-http-metrics/metrics/prometheus"
1719

1820
backendapp "github.com/slok/sloth/internal/http/backend/app"
21+
httpbackendmetricsprometheus "github.com/slok/sloth/internal/http/backend/metrics/prometheus"
1922
"github.com/slok/sloth/internal/http/backend/storage"
2023
storagefake "github.com/slok/sloth/internal/http/backend/storage/fake"
2124
storageprometheus "github.com/slok/sloth/internal/http/backend/storage/prometheus"
2225
storagesearch "github.com/slok/sloth/internal/http/backend/storage/search"
26+
storagewrappers "github.com/slok/sloth/internal/http/backend/storage/wrappers"
2327
"github.com/slok/sloth/internal/http/ui"
2428
"github.com/slok/sloth/internal/log"
2529
)
@@ -65,6 +69,7 @@ func (c serverCommand) Run(ctx context.Context, config RootConfig) error {
6569
defer cancel()
6670

6771
logger := config.Logger.WithValues(log.Kv{"command": c.Name()})
72+
promReg := prometheus.DefaultRegisterer
6873

6974
// Prepare vault refresh
7075
var g run.Group
@@ -143,6 +148,9 @@ func (c serverCommand) Run(ctx context.Context, config RootConfig) error {
143148

144149
// Application server.
145150
{
151+
// Metrics for UI backend.
152+
uiBackendMetricsRecorder := httpbackendmetricsprometheus.NewRecorder(promReg)
153+
146154
var repo unifiedRepository
147155

148156
switch {
@@ -159,8 +167,9 @@ func (c serverCommand) Run(ctx context.Context, config RootConfig) error {
159167
}
160168

161169
repo, err = storageprometheus.NewRepository(ctx, storageprometheus.RepositoryConfig{
162-
PrometheusClient: promv1.NewAPI(client),
170+
PrometheusClient: storageprometheus.NewMeasuredPrometheusAPIClient(uiBackendMetricsRecorder, promv1.NewAPI(client)),
163171
CacheRefreshInterval: c.prometheus.cacheInstantRefreshInterval,
172+
MetricsRecorder: uiBackendMetricsRecorder,
164173
Logger: logger,
165174
})
166175
if err != nil {
@@ -170,6 +179,8 @@ func (c serverCommand) Run(ctx context.Context, config RootConfig) error {
170179
return fmt.Errorf("no storage backend configured")
171180
}
172181

182+
repo = newMeasuredUnifiedRepository(repo, uiBackendMetricsRecorder)
183+
173184
// Wrap repo with search capabilities.
174185
repo, err := storagesearch.NewSearchRepositoryWrapper(repo, repo)
175186
if err != nil {
@@ -188,6 +199,10 @@ func (c serverCommand) Run(ctx context.Context, config RootConfig) error {
188199
uiHandler, err := ui.NewUI(ui.UIConfig{
189200
Logger: logger,
190201
ServiceApp: app,
202+
MetricsRecorder: gohttpmetricsprometheus.NewRecorder(gohttpmetricsprometheus.Config{
203+
Prefix: httpbackendmetricsprometheus.Prefix,
204+
Registry: promReg,
205+
}),
191206
})
192207
if err != nil {
193208
return fmt.Errorf("could not create ui handler: %w", err)
@@ -237,3 +252,13 @@ type unifiedRepository interface {
237252
storage.SLOGetter
238253
storage.ServiceGetter
239254
}
255+
256+
func newMeasuredUnifiedRepository(orig unifiedRepository, metricsRecorder httpbackendmetricsprometheus.Recorder) unifiedRepository {
257+
return struct {
258+
storage.SLOGetter
259+
storage.ServiceGetter
260+
}{
261+
SLOGetter: storagewrappers.NewMeasuredSLOGetter(orig, metricsRecorder),
262+
ServiceGetter: storagewrappers.NewMeasuredServiceGetter(orig, metricsRecorder),
263+
}
264+
}

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ require (
6161
github.com/google/uuid v1.6.0 // indirect
6262
github.com/grafana/regexp v0.0.0-20250905093917-f7b3be9d1853 // indirect
6363
github.com/json-iterator/go v1.1.12 // indirect
64+
github.com/kylelemons/godebug v1.1.0 // indirect
6465
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
6566
github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
6667
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package metrics
2+
3+
import (
4+
"context"
5+
"time"
6+
)
7+
8+
// Recorder is the set of measurements the backend reports. Every method
// receives the elapsed time t and the error err the measured work ended with.
type Recorder interface {
	// MeasureStorageOperationDuration reports one storage operation named op.
	MeasureStorageOperationDuration(ctx context.Context, op string, t time.Duration, err error)
	// MeasurePrometheusStorageBackgroundCacheRefresh reports one background
	// cache refresh of the Prometheus storage.
	MeasurePrometheusStorageBackgroundCacheRefresh(ctx context.Context, t time.Duration, err error)
	// MeasurePrometheusAPIClientOperation reports one Prometheus API client
	// operation named op.
	MeasurePrometheusAPIClientOperation(ctx context.Context, op string, t time.Duration, err error)
}

// noopRecorder implements Recorder with methods that do nothing.
type noopRecorder bool

// NoopRecorder is a Recorder that ignores every measurement it is given.
var NoopRecorder Recorder = noopRecorder(false)

// MeasureStorageOperationDuration does nothing.
func (noopRecorder) MeasureStorageOperationDuration(context.Context, string, time.Duration, error) {}

// MeasurePrometheusStorageBackgroundCacheRefresh does nothing.
func (noopRecorder) MeasurePrometheusStorageBackgroundCacheRefresh(context.Context, time.Duration, error) {
}

// MeasurePrometheusAPIClientOperation does nothing.
func (noopRecorder) MeasurePrometheusAPIClientOperation(context.Context, string, time.Duration, error) {
}
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
package prometheus
2+
3+
import (
4+
"context"
5+
"strconv"
6+
"time"
7+
8+
"github.com/prometheus/client_golang/prometheus"
9+
)
10+
11+
// Prefix is the namespace shared by the metric names of this package.
const Prefix = "sloth"
14+
15+
type Recorder struct {
16+
reg prometheus.Registerer
17+
18+
storagePromCacheLatency *prometheus.HistogramVec
19+
storageOperationLatency *prometheus.HistogramVec
20+
promAPICliLatency *prometheus.HistogramVec
21+
}
22+
23+
func NewRecorder(reg prometheus.Registerer) Recorder {
24+
if reg == nil {
25+
reg = prometheus.DefaultRegisterer
26+
}
27+
28+
r := &Recorder{
29+
reg: reg,
30+
31+
storagePromCacheLatency: prometheus.NewHistogramVec(
32+
prometheus.HistogramOpts{
33+
Namespace: Prefix,
34+
Subsystem: "storage_prometheus",
35+
Name: "cache_background_refresh_duration_seconds",
36+
Help: "Duration histogram of Prometheus storage cache refresh operations.",
37+
Buckets: prometheus.DefBuckets,
38+
},
39+
[]string{"success"},
40+
),
41+
42+
storageOperationLatency: prometheus.NewHistogramVec(
43+
prometheus.HistogramOpts{
44+
Namespace: Prefix,
45+
Subsystem: "storage",
46+
Name: "operation_duration_seconds",
47+
Help: "Duration histogram of storage operations.",
48+
Buckets: prometheus.DefBuckets,
49+
},
50+
[]string{"operation", "success"},
51+
),
52+
53+
promAPICliLatency: prometheus.NewHistogramVec(
54+
prometheus.HistogramOpts{
55+
Namespace: Prefix,
56+
Subsystem: "prometheus_api_client",
57+
Name: "operation_duration_seconds",
58+
Help: "Duration histogram of Prometheus API client operations.",
59+
Buckets: prometheus.DefBuckets,
60+
},
61+
[]string{"operation", "success"},
62+
),
63+
}
64+
65+
r.init()
66+
67+
return *r
68+
}
69+
70+
func (r Recorder) init() {
71+
// Register our collectors.
72+
r.reg.MustRegister(
73+
r.storagePromCacheLatency,
74+
r.promAPICliLatency,
75+
r.storageOperationLatency,
76+
)
77+
}
78+
79+
func (r Recorder) MeasurePrometheusStorageBackgroundCacheRefresh(ctx context.Context, t time.Duration, err error) {
80+
r.storagePromCacheLatency.WithLabelValues(strconv.FormatBool(err == nil)).Observe(t.Seconds())
81+
}
82+
83+
func (r Recorder) MeasurePrometheusAPIClientOperation(ctx context.Context, op string, t time.Duration, err error) {
84+
r.promAPICliLatency.WithLabelValues(op, strconv.FormatBool(err == nil)).Observe(t.Seconds())
85+
}
86+
87+
func (r Recorder) MeasureStorageOperationDuration(ctx context.Context, op string, t time.Duration, err error) {
88+
r.storageOperationLatency.WithLabelValues(op, strconv.FormatBool(err == nil)).Observe(t.Seconds())
89+
}
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
package prometheus_test
2+
3+
import (
4+
"fmt"
5+
"strings"
6+
"testing"
7+
"time"
8+
9+
"github.com/prometheus/client_golang/prometheus"
10+
"github.com/prometheus/client_golang/prometheus/testutil"
11+
"github.com/stretchr/testify/assert"
12+
13+
metricsprometheus "github.com/slok/sloth/internal/http/backend/metrics/prometheus"
14+
)
15+
16+
func TestPrometheusMetricsRecorder(t *testing.T) {
17+
tests := map[string]struct {
18+
measure func(t *testing.T, r metricsprometheus.Recorder)
19+
expMetrics string
20+
}{
21+
"Measuring Prometheus Storage Background Cache Refresh should measure correctly.": {
22+
measure: func(t *testing.T, r metricsprometheus.Recorder) {
23+
r.MeasurePrometheusStorageBackgroundCacheRefresh(t.Context(), 1500*time.Millisecond, nil)
24+
r.MeasurePrometheusStorageBackgroundCacheRefresh(t.Context(), 500*time.Millisecond, nil)
25+
r.MeasurePrometheusStorageBackgroundCacheRefresh(t.Context(), 2500*time.Millisecond, fmt.Errorf("some error"))
26+
},
27+
expMetrics: `
28+
# HELP sloth_storage_prometheus_cache_background_refresh_duration_seconds Duration histogram of Prometheus storage cache refresh operations.
29+
# TYPE sloth_storage_prometheus_cache_background_refresh_duration_seconds histogram
30+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="0.005"} 0
31+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="0.01"} 0
32+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="0.025"} 0
33+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="0.05"} 0
34+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="0.1"} 0
35+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="0.25"} 0
36+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="0.5"} 0
37+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="1"} 0
38+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="2.5"} 1
39+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="5"} 1
40+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="10"} 1
41+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="false",le="+Inf"} 1
42+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_sum{success="false"} 2.5
43+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_count{success="false"} 1
44+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="0.005"} 0
45+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="0.01"} 0
46+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="0.025"} 0
47+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="0.05"} 0
48+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="0.1"} 0
49+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="0.25"} 0
50+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="0.5"} 1
51+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="1"} 1
52+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="2.5"} 2
53+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="5"} 2
54+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="10"} 2
55+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_bucket{success="true",le="+Inf"} 2
56+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_sum{success="true"} 2
57+
sloth_storage_prometheus_cache_background_refresh_duration_seconds_count{success="true"} 2
58+
`,
59+
},
60+
61+
"Measuring Prometheus API Client Operation should measure correctly.": {
62+
measure: func(t *testing.T, r metricsprometheus.Recorder) {
63+
r.MeasurePrometheusAPIClientOperation(t.Context(), "Query", 1200*time.Millisecond, nil)
64+
r.MeasurePrometheusAPIClientOperation(t.Context(), "Query", 800*time.Millisecond, nil)
65+
r.MeasurePrometheusAPIClientOperation(t.Context(), "QueryRange", 3000*time.Millisecond, fmt.Errorf("some error"))
66+
},
67+
expMetrics: `
68+
# HELP sloth_prometheus_api_client_operation_duration_seconds Duration histogram of Prometheus API client operations.
69+
# TYPE sloth_prometheus_api_client_operation_duration_seconds histogram
70+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="0.005"} 0
71+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="0.01"} 0
72+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="0.025"} 0
73+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="0.05"} 0
74+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="0.1"} 0
75+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="0.25"} 0
76+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="0.5"} 0
77+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="1"} 1
78+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="2.5"} 2
79+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="5"} 2
80+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="10"} 2
81+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="Query",success="true",le="+Inf"} 2
82+
sloth_prometheus_api_client_operation_duration_seconds_sum{operation="Query",success="true"} 2
83+
sloth_prometheus_api_client_operation_duration_seconds_count{operation="Query",success="true"} 2
84+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="0.005"} 0
85+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="0.01"} 0
86+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="0.025"} 0
87+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="0.05"} 0
88+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="0.1"} 0
89+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="0.25"} 0
90+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="0.5"} 0
91+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="1"} 0
92+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="2.5"} 0
93+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="5"} 1
94+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="10"} 1
95+
sloth_prometheus_api_client_operation_duration_seconds_bucket{operation="QueryRange",success="false",le="+Inf"} 1
96+
sloth_prometheus_api_client_operation_duration_seconds_sum{operation="QueryRange",success="false"} 3
97+
sloth_prometheus_api_client_operation_duration_seconds_count{operation="QueryRange",success="false"} 1
98+
`,
99+
},
100+
"Measuring Storage Operation Duration should measure correctly.": {
101+
measure: func(t *testing.T, r metricsprometheus.Recorder) {
102+
r.MeasureStorageOperationDuration(t.Context(), "GetSLOs", 700*time.Millisecond, nil)
103+
r.MeasureStorageOperationDuration(t.Context(), "GetSLOs", 400*time.Millisecond, nil)
104+
r.MeasureStorageOperationDuration(t.Context(), "GetServices", 2000*time.Millisecond, fmt.Errorf("some error"))
105+
},
106+
expMetrics: `
107+
# HELP sloth_storage_operation_duration_seconds Duration histogram of storage operations.
108+
# TYPE sloth_storage_operation_duration_seconds histogram
109+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="0.005"} 0
110+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="0.01"} 0
111+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="0.025"} 0
112+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="0.05"} 0
113+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="0.1"} 0
114+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="0.25"} 0
115+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="0.5"} 1
116+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="1"} 2
117+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="2.5"} 2
118+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="5"} 2
119+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="10"} 2
120+
sloth_storage_operation_duration_seconds_bucket{operation="GetSLOs",success="true",le="+Inf"} 2
121+
sloth_storage_operation_duration_seconds_sum{operation="GetSLOs",success="true"} 1.1
122+
sloth_storage_operation_duration_seconds_count{operation="GetSLOs",success="true"} 2
123+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="0.005"} 0
124+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="0.01"} 0
125+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="0.025"} 0
126+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="0.05"} 0
127+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="0.1"} 0
128+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="0.25"} 0
129+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="0.5"} 0
130+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="1"} 0
131+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="2.5"} 1
132+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="5"} 1
133+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="10"} 1
134+
sloth_storage_operation_duration_seconds_bucket{operation="GetServices",success="false",le="+Inf"} 1
135+
sloth_storage_operation_duration_seconds_sum{operation="GetServices",success="false"} 2
136+
sloth_storage_operation_duration_seconds_count{operation="GetServices",success="false"} 1
137+
`,
138+
},
139+
}
140+
141+
for name, test := range tests {
142+
t.Run(name, func(t *testing.T) {
143+
assert := assert.New(t)
144+
145+
reg := prometheus.NewRegistry()
146+
rec := metricsprometheus.NewRecorder(reg)
147+
148+
test.measure(t, rec)
149+
150+
// Check metrics.
151+
err := testutil.GatherAndCompare(reg, strings.NewReader(test.expMetrics))
152+
assert.NoError(err)
153+
})
154+
}
155+
}

internal/http/backend/storage/prometheus/cache.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,13 @@ type cache struct {
2626
SLOGroupingLabelsBySlothID map[string]map[string]struct{}
2727
}
2828

29-
func (r *Repository) refreshCaches(ctx context.Context) error {
29+
func (r *Repository) refreshCaches(ctx context.Context) (err error) {
3030
r.logger.Debugf("Refreshing background Prometheus caches")
31+
start := time.Now()
32+
defer func() {
33+
duration := time.Since(start)
34+
r.metricsRecorder.MeasurePrometheusStorageBackgroundCacheRefresh(ctx, duration, err)
35+
}()
3136

3237
// Get information.
3338
chain := newSLOsInstantHydraterChain(
@@ -48,7 +53,7 @@ func (r *Repository) refreshCaches(ctx context.Context) error {
4853
slosBySlothID: make(map[string]*sloInstantData),
4954
slosBySLOID: make(map[string]*sloInstantData),
5055
}
51-
err := chain.HydrateSLOInstant(ctx, slos)
56+
err = chain.HydrateSLOInstant(ctx, slos)
5257
if err != nil {
5358
return fmt.Errorf("could not hydrate slo instant data: %w", err)
5459
}

0 commit comments

Comments
 (0)