@@ -16,22 +16,48 @@ var (
16
16
)
17
17
18
18
// latencyCollector groups the metric descriptors used to export latency
// data. Every field is a cmetrics map that newLatencyCollector fills with one
// entry per name in latencyMetrics. Lines marked "-" below are the removed
// side of this diff; lines marked "+" are the added side.
type latencyCollector struct {
19
- latency cmetrics
20
- ops cmetrics
19
// latency: per-threshold values as reported (labels: namespace, threshold).
+ latency cmetrics
20
// latencyHistogram: "<metric>_bucket" series (labels: namespace, le).
+ latencyHistogram cmetrics
21
// ops: the "ops/sec" value per namespace.
+ ops cmetrics
22
// histOps: "<metric>_count" series; collect emits the "ops/sec" value here.
+ histOps cmetrics
23
// bucketSum: "<metric>_sum" series; collect emits its accumulated bucketSum here.
+ bucketSum cmetrics
21
24
}
22
25
23
26
func newLatencyCollector () latencyCollector {
24
27
lc := latencyCollector {
25
- latency : map [string ]cmetric {},
26
- ops : map [string ]cmetric {},
28
+ latency : map [string ]cmetric {},
29
+ latencyHistogram : map [string ]cmetric {},
30
+ ops : map [string ]cmetric {},
31
+ histOps : map [string ]cmetric {},
32
+ bucketSum : map [string ]cmetric {},
27
33
}
28
34
for _ , m := range latencyMetrics {
29
35
lc .latency [m ] = cmetric {
30
36
typ : prometheus .GaugeValue ,
31
37
desc : prometheus .NewDesc (
32
38
promkey (systemLatency , m ),
39
+ m + " latency" ,
40
+ []string {"namespace" , "threshold" }, // threshold to be printed as le for histogram
41
+ nil ,
42
+ ),
43
+ }
44
+ lc .latencyHistogram [m ] = cmetric {
45
+ typ : prometheus .GaugeValue ,
46
+ desc : prometheus .NewDesc (
47
+ // for prom histogram latency buckets, metric must end in _bucket
48
+ promkey (systemLatencyHist , m + "_bucket" ),
33
49
m + " latency histogram" ,
34
- []string {"namespace" , "threshold" },
50
+ []string {"namespace" , "le" }, // threshold to be printed as le for histogram, le="1" means ops that completed in less than 1ms
51
+ nil ,
52
+ ),
53
+ }
54
+ lc .histOps [m ] = cmetric {
55
+ typ : prometheus .GaugeValue ,
56
+ desc : prometheus .NewDesc (
57
+ // for prom histogram, must have a metric ending in _count which is equal to the total number of observed events
58
+ promkey (systemLatencyHist , m + "_count" ),
59
+ m + " ops per second for histogram" ,
60
+ []string {"namespace" },
35
61
nil ,
36
62
),
37
63
}
@@ -44,6 +70,16 @@ func newLatencyCollector() latencyCollector {
44
70
nil ,
45
71
),
46
72
}
73
+ lc .bucketSum [m ] = cmetric {
74
+ typ : prometheus .GaugeValue ,
75
+ desc : prometheus .NewDesc (
76
+ // for prom histogram, must have a metric ending in _sum which is equal to the sum of all observed event values
77
+ promkey (systemLatencyHist , m + "_sum" ),
78
+ m + " sum of all buckets" ,
79
+ []string {"namespace" },
80
+ nil ,
81
+ ),
82
+ }
47
83
}
48
84
return lc
49
85
}
@@ -55,6 +91,15 @@ func (lc latencyCollector) describe(ch chan<- *prometheus.Desc) {
55
91
for _ , s := range lc .ops {
56
92
ch <- s .desc
57
93
}
94
+ for _ , s := range lc .histOps {
95
+ ch <- s .desc
96
+ }
97
+ for _ , s := range lc .bucketSum {
98
+ ch <- s .desc
99
+ }
100
+ for _ , s := range lc .latencyHistogram {
101
+ ch <- s .desc
102
+ }
58
103
}
59
104
60
105
func (lc latencyCollector ) collect (conn * as.Connection ) ([]prometheus.Metric , error ) {
@@ -67,6 +112,8 @@ func (lc latencyCollector) collect(conn *as.Connection) ([]prometheus.Metric, er
67
112
return nil , err
68
113
}
69
114
var metrics []prometheus.Metric
115
+ re := regexp .MustCompile ("[0-9.]+" ) // regex to pull the number from the bucket name, >1ms -> 1, >8ms -> 8 etc.
116
+
70
117
for key , ms := range lat {
71
118
if key == "batch-index" {
72
119
continue // TODO: would be nice to do something with this key
@@ -75,21 +122,61 @@ func (lc latencyCollector) collect(conn *as.Connection) ([]prometheus.Metric, er
75
122
if err != nil {
76
123
return nil , fmt .Errorf ("weird latency key %q: %s" , key , err )
77
124
}
125
+ // Read ops/sec before iterating over the thresholds:
126
+ // estimatedBucketOps below is computed from it, and map iteration
127
+ // order is not fixed, so it cannot be picked up inside the loop.
128
+ ops := ms ["ops/sec" ]
129
+ var bucketSum float64
130
+ histOpsMetric := lc .histOps [op ]
131
+ metrics = append (
132
+ metrics ,
133
+ prometheus .MustNewConstMetric (histOpsMetric .desc , histOpsMetric .typ , ops , ns ),
134
+ )
135
+
136
+ opsMetric := lc .ops [op ]
137
+ metrics = append (
138
+ metrics ,
139
+ prometheus .MustNewConstMetric (opsMetric .desc , opsMetric .typ , ops , ns ),
140
+ )
141
+
78
142
for threshold , data := range ms {
79
143
if threshold == "ops/sec" {
80
- m := lc .ops [op ]
81
- metrics = append (
82
- metrics ,
83
- prometheus .MustNewConstMetric (m .desc , m .typ , data , ns ),
84
- )
85
144
continue
86
145
}
87
- m := lc .latency [op ]
146
+ thresholdNum := re .FindString (threshold ) // filter out >1ms to just the number 1, similarly >8ms becomes 8..
147
+ bucketVal , _ := strconv .ParseFloat (thresholdNum , 64 )
148
+ m := lc .latencyHistogram [op ]
149
+ // Latency is exported as a percentage for each threshold.
150
+ // For histogram consumption we want an estimated number of
151
+ // operations per threshold instead, derived from the total ops/sec:
152
+ // estimated ops = ops/sec * percentage / 100.
153
+ estimatedBucketOps := ops * data / 100.0
154
+ bucketSum += (bucketVal * estimatedBucketOps ) // to generate sum like aerospike_latency_hist_read_sum
155
+ if bucketVal == 1.0 {
156
+ below1msAssumption := ops - estimatedBucketOps
157
+ bucketSum += 0.5 * below1msAssumption // for the blahblah_sum histogram metric, to calculate averages, assume the transactions <1ms are .5ms
158
+ }
159
+ leBucketOps := ops - estimatedBucketOps
160
+ metrics = append (
161
+ metrics ,
162
+ prometheus .MustNewConstMetric (m .desc , m .typ , leBucketOps , ns , thresholdNum ),
163
+ )
164
+ m = lc .latency [op ]
88
165
metrics = append (
89
166
metrics ,
90
167
prometheus .MustNewConstMetric (m .desc , m .typ , data , ns , threshold ),
91
168
)
92
169
}
170
+ m := lc .bucketSum [op ]
171
+ metrics = append (
172
+ metrics ,
173
+ prometheus .MustNewConstMetric (m .desc , m .typ , bucketSum , ns ),
174
+ )
175
+ m = lc .latencyHistogram [op ]
176
+ metrics = append (
177
+ metrics ,
178
+ prometheus .MustNewConstMetric (m .desc , m .typ , ops , ns , "+Inf" ),
179
+ )
93
180
}
94
181
return metrics , nil
95
182
}
@@ -109,7 +196,6 @@ func parseLatency(lat string) (map[string]map[string]float64, error) {
109
196
vs := strings .Split (line , "," )
110
197
key := strings .SplitN (vs [0 ], ":" , 2 )[0 ] // strips timestamp
111
198
cols := vs [1 :]
112
-
113
199
if i + 1 >= len (lines ) {
114
200
return nil , fmt .Errorf ("latency: missing measurements line" )
115
201
}
0 commit comments