Skip to content

Commit 834cb2a

Browse files
authored
cherry-pick: fix: use counter instead of gauge for error metrics (#23522) (#23550)
1 parent 428e717 commit 834cb2a

File tree

8 files changed

+50
-118
lines changed

8 files changed

+50
-118
lines changed

Cargo.lock

Lines changed: 0 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docker/dashboards/risingwave-dev-dashboard.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

docker/dashboards/risingwave-user-dashboard.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

grafana/dashboard/dev/cluster_essential.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -313,6 +313,26 @@ def _(outer_panels: Panels):
313313
"remote storage error {{type}}: {{%s}} @ {{%s}}"
314314
% (COMPONENT_LABEL, NODE_LABEL),
315315
),
316+
# We add a small constant 0.05 to make sure that the counter jumps from null to not-null,
317+
# the line will be flat at y=0.05 instead of disappearing.
318+
panels.target(
319+
f"sum(irate({metric('user_compute_error_cnt')}[$__rate_interval])) by (error_type, executor_name, fragment_id) or "
320+
+ f"sum({metric('user_compute_error_cnt')}) by (error_type, executor_name, fragment_id) * 0 + 0.05 "
321+
+ f"unless on({COMPONENT_LABEL}, {NODE_LABEL}) ((absent_over_time({metric('user_compute_error_cnt')}[20s])) > 0)",
322+
"{{error_type}} @ {{executor_name}} (fragment_id={{fragment_id}})",
323+
),
324+
panels.target(
325+
f"sum(irate({metric('user_source_error_cnt')}[$__rate_interval])) by (error_type, source_id, source_name, fragment_id) or "
326+
+ f"sum({metric('user_source_error_cnt')}) by (error_type, source_id, source_name, fragment_id) * 0 + 0.05 "
327+
+ f"unless on({COMPONENT_LABEL}, {NODE_LABEL}) ((absent_over_time({metric('user_source_error_cnt')}[20s])) > 0)",
328+
"{{error_type}} @ {{source_name}} (source_id={{source_id}} fragment_id={{fragment_id}})",
329+
),
330+
panels.target(
331+
f"sum(irate({metric('user_sink_error_cnt')}[$__rate_interval])) by (error_type, sink_id, sink_name, fragment_id) or "
332+
+ f"sum({metric('user_sink_error_cnt')}) by (error_type, sink_id, sink_name, fragment_id) * 0 + 0.05 "
333+
+ f"unless on({COMPONENT_LABEL}, {NODE_LABEL}) ((absent_over_time({metric('user_sink_error_cnt')}[20s])) > 0)",
334+
"{{error_type}} @ {{sink_name}} (sink_id={{sink_id}} fragment_id={{fragment_id}})",
335+
),
316336
],
317337
),
318338
panels.subheader("User Streaming Errors"),

grafana/risingwave-dev-dashboard.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

grafana/risingwave-user-dashboard.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

src/common/metrics/Cargo.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ itertools = { workspace = true }
2424
parking_lot = { workspace = true }
2525
pin-project-lite = { workspace = true }
2626
prometheus = { version = "0.14" }
27-
rw_iter_util = { workspace = true }
2827
rw_resource_util = { workspace = true }
2928
serde = { version = "1", features = ["derive"] }
3029
thiserror-ext = { workspace = true }

src/common/metrics/src/error_metrics.rs

Lines changed: 26 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -12,150 +12,64 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
use std::collections::HashMap;
16-
use std::sync::{Arc, LazyLock};
15+
use std::sync::LazyLock;
1716

18-
use itertools::Itertools;
19-
use parking_lot::Mutex;
20-
use prometheus::Registry;
21-
use prometheus::core::{Collector, Desc};
22-
use prometheus::proto::{Gauge, LabelPair, Metric, MetricFamily};
23-
use rw_iter_util::ZipEqFast;
17+
use prometheus::{IntCounterVec, Registry, register_int_counter_vec_with_registry};
2418

2519
use crate::monitor::GLOBAL_METRICS_REGISTRY;
2620

21+
#[derive(Clone)]
2722
pub struct ErrorMetric<const N: usize> {
28-
payload: Arc<Mutex<HashMap<[String; N], u32>>>,
29-
desc: Desc,
23+
inner: IntCounterVec,
3024
}
3125

3226
impl<const N: usize> ErrorMetric<N> {
33-
pub fn new(name: &str, help: &str, label_names: &[&str; N]) -> Self {
27+
pub fn new(name: &str, help: &str, label_names: &[&str; N], registry: &Registry) -> Self {
3428
Self {
35-
payload: Default::default(),
36-
desc: Desc::new(
37-
name.to_owned(),
38-
help.to_owned(),
39-
label_names.iter().map(|l| l.to_string()).collect_vec(),
40-
Default::default(),
41-
)
42-
.unwrap(),
29+
inner: register_int_counter_vec_with_registry!(name, help, label_names, registry)
30+
.unwrap(),
4331
}
4432
}
4533

4634
pub fn report(&self, labels: [String; N]) {
47-
let mut m = self.payload.lock();
48-
let v = m.entry(labels).or_default();
49-
*v += 1;
50-
}
51-
52-
fn collect(&self) -> MetricFamily {
53-
let mut m = MetricFamily::default();
54-
m.set_name(self.desc.fq_name.clone());
55-
m.set_help(self.desc.help.clone());
56-
m.set_field_type(prometheus::proto::MetricType::GAUGE);
57-
58-
let payload = self.payload.lock().drain().collect_vec();
59-
let mut metrics = Vec::with_capacity(payload.len());
60-
for (labels, count) in payload {
61-
let mut label_pairs = Vec::with_capacity(self.desc.variable_labels.len());
62-
for (name, label) in self.desc.variable_labels.iter().zip_eq_fast(labels) {
63-
let mut label_pair = LabelPair::default();
64-
label_pair.set_name(name.clone());
65-
label_pair.set_value(label);
66-
label_pairs.push(label_pair);
67-
}
68-
69-
let mut metric = Metric::new();
70-
metric.set_label(label_pairs);
71-
let mut gauge = Gauge::default();
72-
gauge.set_value(count as f64);
73-
metric.set_gauge(gauge);
74-
metrics.push(metric);
75-
}
76-
m.set_metric(metrics);
77-
m
35+
self.inner.with_label_values(&labels).inc();
7836
}
7937
}
8038

81-
pub type ErrorMetricRef<const N: usize> = Arc<ErrorMetric<N>>;
82-
8339
/// Metrics for counting errors in the system.
84-
/// The detailed error messages are not supposed to be stored in the metrics, but in the logs.
8540
///
8641
/// Please avoid adding new error metrics here. Instead, introduce new `error_type` for new errors.
8742
#[derive(Clone)]
8843
pub struct ErrorMetrics {
89-
pub user_sink_error: ErrorMetricRef<4>,
90-
pub user_compute_error: ErrorMetricRef<3>,
91-
pub user_source_error: ErrorMetricRef<4>,
44+
pub user_sink_error: ErrorMetric<4>,
45+
pub user_compute_error: ErrorMetric<3>,
46+
pub user_source_error: ErrorMetric<4>,
9247
}
9348

9449
impl ErrorMetrics {
95-
pub fn new() -> Self {
50+
pub fn new(registry: &Registry) -> Self {
9651
Self {
97-
user_sink_error: Arc::new(ErrorMetric::new(
98-
"user_sink_error",
52+
user_sink_error: ErrorMetric::new(
53+
"user_sink_error_cnt",
9954
"Sink errors in the system, queryable by tags",
10055
&["error_type", "sink_id", "sink_name", "fragment_id"],
101-
)),
102-
user_compute_error: Arc::new(ErrorMetric::new(
103-
"user_compute_error",
56+
registry,
57+
),
58+
user_compute_error: ErrorMetric::new(
59+
"user_compute_error_cnt",
10460
"Compute errors in the system, queryable by tags",
10561
&["error_type", "executor_name", "fragment_id"],
106-
)),
107-
user_source_error: Arc::new(ErrorMetric::new(
108-
"user_source_error",
62+
registry,
63+
),
64+
user_source_error: ErrorMetric::new(
65+
"user_source_error_cnt",
10966
"Source errors in the system, queryable by tags",
11067
&["error_type", "source_id", "source_name", "fragment_id"],
111-
)),
68+
registry,
69+
),
11270
}
11371
}
114-
115-
fn desc(&self) -> Vec<&Desc> {
116-
vec![
117-
&self.user_sink_error.desc,
118-
&self.user_compute_error.desc,
119-
&self.user_source_error.desc,
120-
]
121-
}
122-
123-
fn collect(&self) -> Vec<prometheus::proto::MetricFamily> {
124-
vec![
125-
self.user_sink_error.collect(),
126-
self.user_compute_error.collect(),
127-
self.user_source_error.collect(),
128-
]
129-
}
130-
}
131-
132-
impl Default for ErrorMetrics {
133-
fn default() -> Self {
134-
ErrorMetrics::new()
135-
}
136-
}
137-
138-
pub struct ErrorMetricsCollector {
139-
metrics: ErrorMetrics,
140-
}
141-
142-
impl Collector for ErrorMetricsCollector {
143-
fn desc(&self) -> Vec<&Desc> {
144-
self.metrics.desc()
145-
}
146-
147-
fn collect(&self) -> Vec<prometheus::proto::MetricFamily> {
148-
self.metrics.collect()
149-
}
150-
}
151-
152-
pub fn monitor_errors(registry: &Registry, metrics: ErrorMetrics) {
153-
let ec = ErrorMetricsCollector { metrics };
154-
registry.register(Box::new(ec)).unwrap()
15572
}
15673

157-
pub static GLOBAL_ERROR_METRICS: LazyLock<ErrorMetrics> = LazyLock::new(|| {
158-
let e = ErrorMetrics::new();
159-
monitor_errors(&GLOBAL_METRICS_REGISTRY, e.clone());
160-
e
161-
});
74+
pub static GLOBAL_ERROR_METRICS: LazyLock<ErrorMetrics> =
75+
LazyLock::new(|| ErrorMetrics::new(&GLOBAL_METRICS_REGISTRY));

0 commit comments

Comments
 (0)