Fix off-by-one in query/cost

prymitive · prymitive · commit 312def6b1f77 · 2025-07-28T14:11:04.000+01:00
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v0.74.5
+
+### Fixed
+
+- Fixed incorrect suggestions generated by [query/cost](checks/query/cost.md) check.
+
 ## v0.74.4
 
 ### Fixed
diff --git a/internal/checks/query_cost.go b/internal/checks/query_cost.go
@@ -3,6 +3,7 @@ package checks
 import (
 	"context"
 	"fmt"
+	"log/slog"
 	"math"
 	"net/url"
 	"strings"
@@ -85,6 +86,7 @@ func (c CostCheck) Check(ctx context.Context, entry discovery.Entry, entries []d
 		return problems
 	}
 
+	slog.Debug("Calculating cost of the raw query", slog.String("expr", expr.Value.Value))
 	qr, series, err := c.getQueryCost(ctx, expr.Value.Value)
 	if err != nil {
 		problems = append(problems, problemFromError(err, entry.Rule, c.Reporter(), c.prom.Name(), Bug))
@@ -266,6 +268,7 @@ func (c CostCheck) suggestRecordingRules(
 
 					sq := c.rewriteRuleFragment(expr.Value.Value, op.PositionRange(), other.Rule.RecordingRule.Record.Value+extra)
 					var details strings.Builder
+					slog.Debug("Calculating cost of the new query", slog.String("expr", sq))
 					qr, afterSeries, err := c.getQueryCost(ctx, sq)
 					if err == nil {
 						if qr.Stats.Samples.TotalQueryableSamples >= beforeStats.Samples.TotalQueryableSamples &&
@@ -343,7 +346,7 @@ func (c CostCheck) rewriteRuleFragment(expr string, fragment posrange.PositionRa
 		buf.WriteString(expr[:int(fragment.Start)])
 	}
 	buf.WriteString(replacement)
-	if int(fragment.End)+1 < len(expr) {
+	if int(fragment.End) < len(expr) {
 		buf.WriteString(expr[int(fragment.End):])
 	}
 	return buf.String()
@@ -354,18 +357,25 @@ func (c CostCheck) diffStatsInt(a, b int) string {
 	if delta == 0 || math.IsNaN(delta) {
 		return fmt.Sprintf("%d (no change)", a)
 	}
-	return fmt.Sprintf("%d instead of %d (%0.2f%%)", b, a, delta)
+	return fmt.Sprintf("%d instead of %d (%s%%)", b, a, formatDelta(delta))
 }
 
 func (c CostCheck) diffStatsDuration(a, b float64) string {
 	delta := ((b - a) / a) * 100
 	if delta == 0 || math.IsNaN(delta) {
 		return output.HumanizeDuration(c.statToDuration(a)) + " (no change)"
 	}
-	return fmt.Sprintf("%s instead of %s (%0.2f%%)",
+	return fmt.Sprintf("%s instead of %s (%s%%)",
 		output.HumanizeDuration(c.statToDuration(b)),
 		output.HumanizeDuration(c.statToDuration(a)),
-		delta)
+		formatDelta(delta))
+}
+
+func formatDelta(delta float64) string {
+	if delta <= 0 {
+		return fmt.Sprintf("%0.2f", delta)
+	}
+	return fmt.Sprintf("+%0.2f", delta)
 }
 
 func (c CostCheck) isSuggestionFor(src, potential utils.Source, join *utils.Join, unless *utils.Unless) (promParser.Node, string, bool, bool) {
diff --git a/internal/checks/query_cost_test.go b/internal/checks/query_cost_test.go
@@ -1375,6 +1375,75 @@ func TestCostCheck(t *testing.T) {
 				},
 			},
 		},
+		{
+			description: "suggest recording rule / complex",
+			content: `- record: instance_job:fl2_hmd_request_phase_latency_30ms_good:rate5m
+  expr: sum without (le) (histogram_fraction(0, 0.03, rate(fl2_request_phase_duration_seconds[5m])) * histogram_count(rate(fl2_request_phase_duration_seconds[5m])))
+`,
+			checker: func(prom *promapi.FailoverGroup) checks.RuleChecker {
+				return checks.NewCostCheck(prom, 100, 100, 0, 0, "check comment", checks.Warning)
+			},
+			prometheus: newSimpleProm,
+			entries: mustParseContent(`
+
+- record: instance_job:fl2_hmd_request_phase_latency_count:rate5m
+  expr: histogram_count(rate(fl2_request_phase_duration_seconds[5m]))
+`),
+			problems: true,
+			mocks: []*prometheusMock{
+				{
+					conds: []requestCondition{
+						requireQueryPath,
+						formCond{key: "query", value: "count(sum without (le) (histogram_fraction(0, 0.03, rate(fl2_request_phase_duration_seconds[5m])) * histogram_count(rate(fl2_request_phase_duration_seconds[5m]))))"},
+					},
+					resp: vectorResponse{
+						samples: []*model.Sample{
+							generateSample(map[string]string{}),
+						},
+						stats: promapi.QueryStats{
+							Samples: promapi.QuerySamples{
+								TotalQueryableSamples: 50,
+								PeakSamples:           50,
+							},
+							Timings: promapi.QueryTimings{
+								EvalTotalTime: 10,
+							},
+						},
+					},
+				},
+				{
+					conds: []requestCondition{
+						requireQueryPath,
+						formCond{key: "query", value: "count(sum without (le) (histogram_fraction(0, 0.03, rate(fl2_request_phase_duration_seconds[5m])) * instance_job:fl2_hmd_request_phase_latency_count:rate5m))"},
+					},
+					resp: vectorResponse{
+						samples: []*model.Sample{
+							generateSample(map[string]string{}),
+						},
+						stats: promapi.QueryStats{
+							Samples: promapi.QuerySamples{
+								TotalQueryableSamples: 30,
+								PeakSamples:           30,
+							},
+							Timings: promapi.QueryTimings{
+								EvalTotalTime: 11,
+							},
+						},
+					},
+				},
+				{
+					conds: []requestCondition{
+						requireQueryPath,
+						formCond{key: "query", value: checks.BytesPerSampleQuery},
+					},
+					resp: vectorResponse{
+						samples: []*model.Sample{
+							generateSampleWithValue(map[string]string{}, 2048),
+						},
+					},
+				},
+			},
+		},
 	}
 
 	runTests(t, testCases)
diff --git a/internal/checks/query_cost_test.snap b/internal/checks/query_cost_test.snap
@@ -442,6 +442,52 @@
 
 ---
 
+[TestCostCheck/suggest_recording_rule_/_complex - 1]
+- description: suggest recording rule / complex
+  content: |
+    - record: instance_job:fl2_hmd_request_phase_latency_30ms_good:rate5m
+      expr: sum without (le) (histogram_fraction(0, 0.03, rate(fl2_request_phase_duration_seconds[5m])) * histogram_count(rate(fl2_request_phase_duration_seconds[5m])))
+  output: |
+    2 |   expr: sum without (le) (histogram_fraction(0, 0.03, rate(fl2_request_phase_duration_seconds[5m])) * histogram_count(rate(fl2_request_phase_duration_seconds[5m])))
+                                                                                                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Use `instance_job:fl2_hmd_request_phase_latency_count:rate5m` here instead to speed up the query.
+  problem:
+    reporter: query/cost
+    summary: query could use a recording rule
+    details: |
+        There is a recording rule that already stores the result of this query, use it here to speed up this query.
+
+        ```yaml
+        - record: instance_job:fl2_hmd_request_phase_latency_count:rate5m
+          expr: histogram_count(rate(fl2_request_phase_duration_seconds[5m]))
+        ```
+
+        Using `instance_job:fl2_hmd_request_phase_latency_count:rate5m` rule would speed up this query:
+
+        - Total queried samples would be 30 instead of 50 (-40.00%)
+        - Peak queried samples would be 30 instead of 50 (-40.00%)
+        - Query evaluation time would be 11s instead of 10s (+10.00%)
+
+        To get results for both original and suggested query click below:
+
+        - [Original query](https://simple.example.com/graph?g0.expr=sum+without+%28le%29+%28histogram_fraction%280%2C+0.03%2C+rate%28fl2_request_phase_duration_seconds%5B5m%5D%29%29+%2A+histogram_count%28rate%28fl2_request_phase_duration_seconds%5B5m%5D%29%29%29&g0.tab=table)
+        - [Suggested query](https://simple.example.com/graph?g0.expr=sum+without+%28le%29+%28histogram_fraction%280%2C+0.03%2C+rate%28fl2_request_phase_duration_seconds%5B5m%5D%29%29+%2A+instance_job%3Afl2_hmd_request_phase_latency_count%3Arate5m%29&g0.tab=table)
+    diagnostics:
+        - message: Use `instance_job:fl2_hmd_request_phase_latency_count:rate5m` here instead to speed up the query.
+          pos:
+            - line: 2
+              firstcolumn: 9
+              lastcolumn: 164
+          firstcolumn: 95
+          lastcolumn: 155
+          kind: 0
+    lines:
+        first: 2
+        last: 2
+    severity: 0
+    anchor: 0
+
+---
+
 [TestCostCheck/suggest_recording_rule_/_ignore_multi-source - 1]
 []