提交 9a217794 编写于 作者: B beorn7

Protect gauge-based alerts against failed scrapes

Signed-off-by: beorn7 <beorn@grafana.com>
上级 52707535
...@@ -7,9 +7,11 @@ ...@@ -7,9 +7,11 @@
{ {
alert: 'PrometheusBadConfig', alert: 'PrometheusBadConfig',
expr: ||| expr: |||
prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0 # Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
||| % $._config, ||| % $._config,
'for': '15m', 'for': '10m',
labels: { labels: {
severity: 'critical', severity: 'critical',
}, },
...@@ -21,10 +23,12 @@ ...@@ -21,10 +23,12 @@
{ {
alert: 'PrometheusNotificationQueueRunningFull', alert: 'PrometheusNotificationQueueRunningFull',
expr: ||| expr: |||
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
( (
predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30) predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
> >
prometheus_notifications_queue_capacity{%(prometheusSelector)s} min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m])
) )
||| % $._config, ||| % $._config,
'for': '15m', 'for': '15m',
...@@ -79,7 +83,9 @@ ...@@ -79,7 +83,9 @@
{ {
alert: 'PrometheusNotConnectedToAlertmanagers', alert: 'PrometheusNotConnectedToAlertmanagers',
expr: ||| expr: |||
prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1 # Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1
||| % $._config, ||| % $._config,
'for': '10m', 'for': '10m',
labels: { labels: {
...@@ -201,10 +207,12 @@ ...@@ -201,10 +207,12 @@
{ {
alert: 'PrometheusRemoteWriteBehind', alert: 'PrometheusRemoteWriteBehind',
expr: ||| expr: |||
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
( (
prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s} max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m])
- on(job, instance) group_right - on(job, instance) group_right
prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s} max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m])
) )
> 120 > 120
||| % $._config, ||| % $._config,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册