Skip to content

Commit 9b785b0

Browse files
mknyszek authored and kakkoyun committed
Reduce granularity of histogram buckets for Go 1.17 collector (#974)
The Go runtime/metrics package currently exports extremely granular histograms. Exponentially bucket any histogram with unit "seconds" or "bytes" instead to dramatically reduce the number of buckets, and thus the number of metrics. This change also adds a test to check for expected cardinality to prevent cardinality surprises in the future. Signed-off-by: Michael Anthony Knyszek <[email protected]>
1 parent 5a529ae commit 9b785b0

5 files changed

+205
-27
lines changed

prometheus/gen_go_collector_metrics_set.go

+50
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ import (
2121
"fmt"
2222
"go/format"
2323
"log"
24+
"math"
2425
"os"
26+
"runtime"
2527
"runtime/metrics"
2628
"strconv"
2729
"strings"
@@ -35,6 +37,10 @@ func main() {
3537
if len(os.Args) != 2 {
3638
log.Fatal("requires Go version (e.g. go1.17) as an argument")
3739
}
40+
toolVersion := runtime.Version()
41+
if majorVersion := toolVersion[:strings.LastIndexByte(toolVersion, '.')]; majorVersion != os.Args[1] {
42+
log.Fatalf("using Go version %q but expected Go version %q", majorVersion, os.Args[1])
43+
}
3844
version, err := parseVersion(os.Args[1])
3945
if err != nil {
4046
log.Fatalf("parsing Go version: %v", err)
@@ -45,9 +51,11 @@ func main() {
4551
err = testFile.Execute(&buf, struct {
4652
Descriptions []metrics.Description
4753
GoVersion goVersion
54+
Cardinality int
4855
}{
4956
Descriptions: metrics.All(),
5057
GoVersion: version,
58+
Cardinality: rmCardinality(),
5159
})
5260
if err != nil {
5361
log.Fatalf("executing template: %v", err)
@@ -85,6 +93,46 @@ func parseVersion(s string) (goVersion, error) {
8593
return goVersion(i), err
8694
}
8795

96+
func rmCardinality() int {
97+
cardinality := 0
98+
99+
// Collect all histogram samples so that we can get their buckets.
100+
// The API guarantees that the buckets are always fixed for the lifetime
101+
// of the process.
102+
var histograms []metrics.Sample
103+
for _, d := range metrics.All() {
104+
if d.Kind == metrics.KindFloat64Histogram {
105+
histograms = append(histograms, metrics.Sample{Name: d.Name})
106+
} else {
107+
cardinality++
108+
}
109+
}
110+
111+
// Handle histograms.
112+
metrics.Read(histograms)
113+
for i := range histograms {
114+
name := histograms[i].Name
115+
buckets := internal.RuntimeMetricsBucketsForUnit(
116+
histograms[i].Value.Float64Histogram().Buckets,
117+
name[strings.IndexRune(name, ':')+1:],
118+
)
119+
cardinality += len(buckets) + 3 // Plus total count, sum, and the implicit infinity bucket.
120+
// runtime/metrics bucket boundaries are lower-bound-inclusive, but
121+
// always represents each actual *boundary* so Buckets is always
122+
// 1 longer than Counts, while in Prometheus the mapping is one-to-one,
123+
// as the bottom bucket extends to -Inf, and the top infinity bucket is
124+
// implicit. Therefore, we should have one fewer bucket than is listed
125+
// above.
126+
cardinality--
127+
if buckets[len(buckets)-1] == math.Inf(1) {
128+
// We already counted the infinity bucket separately.
129+
cardinality--
130+
}
131+
}
132+
133+
return cardinality
134+
}
135+
88136
var testFile = template.Must(template.New("testFile").Funcs(map[string]interface{}{
89137
"rm2prom": func(d metrics.Description) string {
90138
ns, ss, n, ok := internal.RuntimeMetricsToProm(&d)
@@ -112,4 +160,6 @@ var expectedRuntimeMetrics = map[string]string{
112160
{{- end -}}
113161
{{end}}
114162
}
163+
164+
const expectedRuntimeMetricsCardinality = {{.Cardinality}}
115165
`))

prometheus/go_collector_go117.go

+46-21
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"math"
2121
"runtime"
2222
"runtime/metrics"
23+
"strings"
2324
"sync"
2425

2526
//nolint:staticcheck // Ignore SA1019. Need to keep deprecated package for compatibility.
@@ -56,9 +57,20 @@ type goCollector struct {
5657
// Deprecated: Use collectors.NewGoCollector instead.
5758
func NewGoCollector() Collector {
5859
descriptions := metrics.All()
59-
descMap := make(map[string]*metrics.Description)
60-
for i := range descriptions {
61-
descMap[descriptions[i].Name] = &descriptions[i]
60+
61+
// Collect all histogram samples so that we can get their buckets.
62+
// The API guarantees that the buckets are always fixed for the lifetime
63+
// of the process.
64+
var histograms []metrics.Sample
65+
for _, d := range descriptions {
66+
if d.Kind == metrics.KindFloat64Histogram {
67+
histograms = append(histograms, metrics.Sample{Name: d.Name})
68+
}
69+
}
70+
metrics.Read(histograms)
71+
bucketsMap := make(map[string][]float64)
72+
for i := range histograms {
73+
bucketsMap[histograms[i].Name] = histograms[i].Value.Float64Histogram().Buckets
6274
}
6375

6476
// Generate a Desc and ValueType for each runtime/metrics metric.
@@ -83,13 +95,15 @@ func NewGoCollector() Collector {
8395
var m collectorMetric
8496
if d.Kind == metrics.KindFloat64Histogram {
8597
_, hasSum := rmExactSumMap[d.Name]
98+
unit := d.Name[strings.IndexRune(d.Name, ':')+1:]
8699
m = newBatchHistogram(
87100
NewDesc(
88101
BuildFQName(namespace, subsystem, name),
89102
d.Description,
90103
nil,
91104
nil,
92105
),
106+
internal.RuntimeMetricsBucketsForUnit(bucketsMap[d.Name], unit),
93107
hasSum,
94108
)
95109
} else if d.Cumulative {
@@ -299,42 +313,53 @@ type batchHistogram struct {
299313
// but Write calls may operate concurrently with updates.
300314
// Contention between these two sources should be rare.
301315
mu sync.Mutex
302-
buckets []float64 // Inclusive lower bounds.
316+
buckets []float64 // Inclusive lower bounds, like runtime/metrics.
303317
counts []uint64
304318
sum float64 // Used if hasSum is true.
305319
}
306320

307-
func newBatchHistogram(desc *Desc, hasSum bool) *batchHistogram {
308-
h := &batchHistogram{desc: desc, hasSum: hasSum}
321+
// newBatchHistogram creates a new batch histogram value with the given
322+
// Desc, buckets, and whether or not it has an exact sum available.
323+
//
324+
// buckets must always be from the runtime/metrics package, following
325+
// the same conventions.
326+
func newBatchHistogram(desc *Desc, buckets []float64, hasSum bool) *batchHistogram {
327+
h := &batchHistogram{
328+
desc: desc,
329+
buckets: buckets,
330+
// Because buckets follows runtime/metrics conventions, there's
331+
// 1 more value in the buckets list than there are buckets represented,
332+
// because in runtime/metrics, the bucket values represent *boundaries*,
333+
// and non-Inf boundaries are inclusive lower bounds for that bucket.
334+
counts: make([]uint64, len(buckets)-1),
335+
hasSum: hasSum,
336+
}
309337
h.init(h)
310338
return h
311339
}
312340

313341
// update updates the batchHistogram from a runtime/metrics histogram.
314342
//
315343
// sum must be provided if the batchHistogram was created to have an exact sum.
344+
// h.buckets must be a strict subset of his.Buckets.
316345
func (h *batchHistogram) update(his *metrics.Float64Histogram, sum float64) {
317346
counts, buckets := his.Counts, his.Buckets
318-
// Skip a -Inf bucket altogether. It's not clear how to represent that.
319-
if math.IsInf(buckets[0], -1) {
320-
buckets = buckets[1:]
321-
counts = counts[1:]
322-
}
323347

324348
h.mu.Lock()
325349
defer h.mu.Unlock()
326350

327-
// Check if we're initialized.
328-
if h.buckets == nil {
329-
// Make copies of counts and buckets. It's really important
330-
// that we don't retain his.Counts or his.Buckets anywhere since
331-
// it's going to get reused.
332-
h.buckets = make([]float64, len(buckets))
333-
copy(h.buckets, buckets)
334-
335-
h.counts = make([]uint64, len(counts))
351+
// Clear buckets.
352+
for i := range h.counts {
353+
h.counts[i] = 0
354+
}
355+
// Copy and reduce buckets.
356+
var j int
357+
for i, count := range counts {
358+
h.counts[j] += count
359+
if buckets[i+1] == h.buckets[j+1] {
360+
j++
361+
}
336362
}
337-
copy(h.counts, counts)
338363
if h.hasSum {
339364
h.sum = sum
340365
}

prometheus/go_collector_go117_test.go

+42-6
Original file line numberDiff line numberDiff line change
@@ -140,9 +140,13 @@ func TestBatchHistogram(t *testing.T) {
140140
}
141141
metrics.Read(s)
142142
rmHist := s[0].Value.Float64Histogram()
143-
// runtime/metrics histograms always have -Inf and +Inf buckets.
144-
// We never handle -Inf and +Inf is implicit.
145-
wantBuckets := len(rmHist.Buckets) - 2
143+
wantBuckets := internal.RuntimeMetricsBucketsForUnit(rmHist.Buckets, "bytes")
144+
// runtime/metrics histograms always have a +Inf bucket and are lower
145+
// bound inclusive. In contrast, we have an implicit +Inf bucket and
146+
// are upper bound inclusive, so we can chop off the first bucket
147+
// (since the conversion to upper bound inclusive will shift all buckets
148+
// down one index) and the +Inf for the last bucket.
149+
wantBuckets = wantBuckets[1 : len(wantBuckets)-1]
146150

147151
// Check to make sure the output proto makes sense.
148152
pb := &dto.Metric{}
@@ -151,14 +155,14 @@ func TestBatchHistogram(t *testing.T) {
151155
if math.IsInf(pb.Histogram.Bucket[len(pb.Histogram.Bucket)-1].GetUpperBound(), +1) {
152156
t.Errorf("found +Inf bucket")
153157
}
154-
if got := len(pb.Histogram.Bucket); got != wantBuckets {
155-
t.Errorf("got %d buckets in protobuf, want %d", got, wantBuckets)
158+
if got := len(pb.Histogram.Bucket); got != len(wantBuckets) {
159+
t.Errorf("got %d buckets in protobuf, want %d", got, len(wantBuckets))
156160
}
157161
for i, bucket := range pb.Histogram.Bucket {
158162
// runtime/metrics histograms are lower-bound inclusive, but we're
159163
// upper-bound inclusive. So just make sure the new inclusive upper
160164
// bound is somewhere close by (in some cases it's equal).
161-
wantBound := rmHist.Buckets[i+1]
165+
wantBound := wantBuckets[i]
162166
if gotBound := *bucket.UpperBound; (wantBound-gotBound)/wantBound > 0.001 {
163167
t.Errorf("got bound %f, want within 0.1%% of %f", gotBound, wantBound)
164168
}
@@ -244,6 +248,7 @@ func TestExpectedRuntimeMetrics(t *testing.T) {
244248

245249
descs := metrics.All()
246250
rmSet := make(map[string]struct{})
251+
// Iterate over runtime-reported descriptions to find new metrics.
247252
for i := range descs {
248253
rmName := descs[i].Name
249254
rmSet[rmName] = struct{}{}
@@ -263,6 +268,8 @@ func TestExpectedRuntimeMetrics(t *testing.T) {
263268
continue
264269
}
265270
}
271+
// Now iterate over the expected metrics and look for removals.
272+
cardinality := 0
266273
for rmName, fqName := range expectedRuntimeMetrics {
267274
if _, ok := rmSet[rmName]; !ok {
268275
t.Errorf("runtime/metrics metric %s removed", rmName)
@@ -272,13 +279,42 @@ func TestExpectedRuntimeMetrics(t *testing.T) {
272279
t.Errorf("runtime/metrics metric %s not appearing under expected name %s", rmName, fqName)
273280
continue
274281
}
282+
283+
// While we're at it, check to make sure expected cardinality lines
284+
// up, but at the point of the protobuf write to get as close to the
285+
// real deal as possible.
286+
//
287+
// Note that we filter out non-runtime/metrics metrics here, because
288+
// those are manually managed.
289+
var m dto.Metric
290+
if err := goMetricSet[fqName].Write(&m); err != nil {
291+
t.Errorf("writing metric %s: %v", fqName, err)
292+
continue
293+
}
294+
// N.B. These are the only fields populated by runtime/metrics metrics specifically.
295+
// Other fields are populated by e.g. GCStats metrics.
296+
switch {
297+
case m.Counter != nil:
298+
fallthrough
299+
case m.Gauge != nil:
300+
cardinality++
301+
case m.Histogram != nil:
302+
cardinality += len(m.Histogram.Bucket) + 3 // + sum, count, and +inf
303+
default:
304+
t.Errorf("unexpected protobuf structure for metric %s", fqName)
305+
}
275306
}
276307

277308
if t.Failed() {
278309
t.Log("a new Go version may have been detected, please run")
279310
t.Log("\tgo run gen_go_collector_metrics_set.go go1.X")
280311
t.Log("where X is the Go version you are currently using")
281312
}
313+
314+
expectCardinality := expectedRuntimeMetricsCardinality
315+
if cardinality != expectCardinality {
316+
t.Errorf("unexpected cardinality for runtime/metrics metrics: got %d, want %d", cardinality, expectCardinality)
317+
}
282318
}
283319

284320
func TestGoCollectorConcurrency(t *testing.T) {

prometheus/go_collector_metrics_go117_test.go

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

prometheus/internal/go_runtime_metrics.go

+65
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package internal
1818

1919
import (
20+
"math"
2021
"path"
2122
"runtime/metrics"
2223
"strings"
@@ -75,3 +76,67 @@ func RuntimeMetricsToProm(d *metrics.Description) (string, string, string, bool)
7576
}
7677
return namespace, subsystem, name, valid
7778
}
79+
80+
// RuntimeMetricsBucketsForUnit takes a set of buckets obtained for a runtime/metrics histogram
81+
// type (so, lower-bound inclusive) and a unit from a runtime/metrics name, and produces
82+
// a reduced set of buckets. This function always removes any -Inf bucket as it's represented
83+
// as the bottom-most upper-bound inclusive bucket in Prometheus.
84+
func RuntimeMetricsBucketsForUnit(buckets []float64, unit string) []float64 {
85+
switch unit {
86+
case "bytes":
87+
// Rebucket as powers of 2.
88+
return rebucketExp(buckets, 2)
89+
case "seconds":
90+
// Rebucket as powers of 10 and then merge all buckets greater
91+
// than 1 second into the +Inf bucket.
92+
b := rebucketExp(buckets, 10)
93+
for i := range b {
94+
if b[i] <= 1 {
95+
continue
96+
}
97+
b[i] = math.Inf(1)
98+
b = b[:i+1]
99+
break
100+
}
101+
return b
102+
}
103+
return buckets
104+
}
105+
106+
// rebucketExp takes a list of bucket boundaries (lower bound inclusive) and
107+
// downsamples the buckets to those a multiple of base apart. The end result
108+
// is a roughly exponential (in many cases, perfectly exponential) bucketing
109+
// scheme.
110+
func rebucketExp(buckets []float64, base float64) []float64 {
111+
bucket := buckets[0]
112+
var newBuckets []float64
113+
// We may see a -Inf here, in which case, add it and skip it
114+
// since we risk producing NaNs otherwise.
115+
//
116+
// We need to preserve -Inf values to maintain runtime/metrics
117+
// conventions. We'll strip it out later.
118+
if bucket == math.Inf(-1) {
119+
newBuckets = append(newBuckets, bucket)
120+
buckets = buckets[1:]
121+
bucket = buckets[0]
122+
}
123+
// From now on, bucket should always have a non-Inf value because
124+
// Infs are only ever at the ends of the bucket lists, so
125+
// arithmetic operations on it are non-NaN.
126+
for i := 1; i < len(buckets); i++ {
127+
if bucket >= 0 && buckets[i] < bucket*base {
128+
// The next bucket we want to include is at least bucket*base.
129+
continue
130+
} else if bucket < 0 && buckets[i] < bucket/base {
131+
// In this case the bucket we're targeting is negative, and since
132+
// we're ascending through buckets here, we need to divide to get
133+
// closer to zero exponentially.
134+
continue
135+
}
136+
// The +Inf bucket will always be the last one, and we'll always
137+
// end up including it here because bucket
138+
newBuckets = append(newBuckets, bucket)
139+
bucket = buckets[i]
140+
}
141+
return append(newBuckets, bucket)
142+
}

0 commit comments

Comments (0)