From 8af10bed11d0eac53e8eab13a3df276cd613647d Mon Sep 17 00:00:00 2001 From: mrproliu <741550557@qq.com> Date: Wed, 13 May 2020 14:34:16 +0800 Subject: [PATCH] Support regex to include or exclude the names in alarm module (#4775) * Support regex to include or exclude the names, requested through mail list, https://lists.apache.org/x/thread.html/rb6d91093776cf61880425449d3d25f9d560cd32363c0075de9a85d13@%3Cdev.skywalking.apache.org%3E Co-authored-by: Mrproliu Co-authored-by: kezhenxu94 --- dist-material/alarm-settings-sample.yml | 10 +++ docs/en/setup/backend/backend-alarm.md | 4 +- .../server/core/alarm/provider/AlarmRule.java | 5 +- .../core/alarm/provider/RulesReader.java | 2 + .../core/alarm/provider/RunningRule.java | 36 +++++++++-- .../core/alarm/provider/RunningRuleTest.java | 64 +++++++++++++++++++ 6 files changed, 115 insertions(+), 6 deletions(-) diff --git a/dist-material/alarm-settings-sample.yml b/dist-material/alarm-settings-sample.yml index e332587a32..c5f7d551ec 100644 --- a/dist-material/alarm-settings-sample.yml +++ b/dist-material/alarm-settings-sample.yml @@ -39,6 +39,16 @@ rules: op: ">" period: 10 count: 1 + service_instance_resp_time_rule: + metrics-name: service_instance_resp_time + op: ">" + threshold: 1000 + # [Optional] Default, match all services in this metrics + include-names-regex: instance\_\d+ + period: 10 + count: 2 + silence-period: 5 + message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes #webhooks: # - http://127.0.0.1/notify/ diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index c00817bf79..387b1935f6 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -22,6 +22,8 @@ Alarm rule is constituted by following keys [List of all potential metrics name](#list-of-all-potential-metrics-name). - **Include names**. The following entity names are included in this rule. 
Please follow [Entity name define](#entity-name). - **Exclude names**. The following entity names are excluded in this rule. Please follow [Entity name define](#entity-name). +- **Include names regex**. Provide a regex to include the entity names. If both the include name list and the include name regex are set, both rules will take effect. +- **Exclude names regex**. Provide a regex to exclude the entity names. If both the exclude name list and the exclude name regex are set, both rules will take effect. - **Threshold**. The target value. For multiple values metrics, such as **percentile**, the threshold is an array. Described like `value1, value2, value3, value4, value5`. Each value could the threshold for each value of the metrics. Set the value to `-` if don't want to trigger alarm by this or some of the values. @@ -80,7 +82,7 @@ We provided a default `alarm-setting.yml` in our distribution only for convenien 1. Service average response time over 1s in last 3 minutes. 1. Service success rate lower than 80% in last 2 minutes. 1. Percentile of service response time is over 1s in last 3 minutes -1. Service Instance average response time over 1s in last 2 minutes. +1. Service Instance average response time over 1s in last 2 minutes, and the instance name matches the regex. 1. Endpoint average response time over 1s in last 2 minutes. 1. Database access average response time over 1s in last 2 minutes. 1. Endpoint relation average response time over 1s in last 2 minutes. 
diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java index ef6672b0ba..f895972113 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java @@ -38,7 +38,9 @@ public class AlarmRule { private String metricsName; private ArrayList includeNames; + private String includeNamesRegex; private ArrayList excludeNames; + private String excludeNamesRegex; private String threshold; private String op; private int period; @@ -61,7 +63,8 @@ public class AlarmRule { return period == alarmRule.period && count == alarmRule.count && silencePeriod == alarmRule.silencePeriod && Objects .equals(alarmRuleName, alarmRule.alarmRuleName) && Objects.equals(metricsName, alarmRule.metricsName) && Objects .equals(includeNames, alarmRule.includeNames) && Objects.equals(excludeNames, alarmRule.excludeNames) && Objects - .equals(threshold, alarmRule.threshold) && Objects.equals(op, alarmRule.op) && Objects.equals(message, alarmRule.message); + .equals(threshold, alarmRule.threshold) && Objects.equals(op, alarmRule.op) && Objects.equals(message, alarmRule.message) + && Objects.equals(includeNamesRegex, alarmRule.includeNamesRegex) && Objects.equals(excludeNamesRegex, alarmRule.excludeNamesRegex); } @Override diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java index de93319b31..371f6c32a4 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java +++ 
b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java @@ -63,6 +63,8 @@ public class RulesReader { alarmRule.setMetricsName((String) metricsName); alarmRule.setIncludeNames((ArrayList) settings.getOrDefault("include-names", new ArrayList(0))); alarmRule.setExcludeNames((ArrayList) settings.getOrDefault("exclude-names", new ArrayList(0))); + alarmRule.setIncludeNamesRegex((String) settings.getOrDefault("include-names-regex", "")); + alarmRule.setExcludeNamesRegex((String) settings.getOrDefault("exclude-names-regex", "")); alarmRule.setThreshold(settings.get("threshold").toString()); alarmRule.setOp((String) settings.get("op")); alarmRule.setPeriod((Integer) settings.getOrDefault("period", 1)); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index 5310abd75c..00d61469fb 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -25,9 +25,12 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantLock; +import java.util.regex.Pattern; + import lombok.RequiredArgsConstructor; import lombok.ToString; import lombok.extern.slf4j.Slf4j; +import org.apache.skywalking.apm.util.StringUtil; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm; import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder; @@ -59,6 +62,8 @@ public class RunningRule { private volatile MetricsValueType valueType; private final List includeNames; private final List excludeNames; + 
private final Pattern includeNamesRegex; + private final Pattern excludeNamesRegex; private final AlarmMessageFormatter formatter; public RunningRule(AlarmRule alarmRule) { @@ -78,6 +83,10 @@ public class RunningRule { this.includeNames = alarmRule.getIncludeNames(); this.excludeNames = alarmRule.getExcludeNames(); + this.includeNamesRegex = StringUtil.isNotEmpty(alarmRule.getIncludeNamesRegex()) ? + Pattern.compile(alarmRule.getIncludeNamesRegex()) : null; + this.excludeNamesRegex = StringUtil.isNotEmpty(alarmRule.getExcludeNamesRegex()) ? + Pattern.compile(alarmRule.getExcludeNamesRegex()) : null; this.formatter = new AlarmMessageFormatter(alarmRule.getMessage()); } @@ -97,19 +106,38 @@ public class RunningRule { return; } + final String metaName = meta.getName(); if (CollectionUtils.isNotEmpty(includeNames)) { - if (!includeNames.contains(meta.getName())) { + if (!includeNames.contains(metaName)) { if (log.isTraceEnabled()) { - log.trace("{} isn't in the including list {}", meta.getName(), includeNames); + log.trace("{} isn't in the including list {}", metaName, includeNames); } return; } } if (CollectionUtils.isNotEmpty(excludeNames)) { - if (excludeNames.contains(meta.getName())) { + if (excludeNames.contains(metaName)) { + if (log.isTraceEnabled()) { + log.trace("{} is in the excluding list {}", metaName, excludeNames); + } + return; + } + } + + if (includeNamesRegex != null) { + if (!includeNamesRegex.matcher(metaName).matches()) { + if (log.isTraceEnabled()) { + log.trace("{} doesn't match the include regex {}", metaName, includeNamesRegex); + } + return; + } + } + + if (excludeNamesRegex != null) { + if (excludeNamesRegex.matcher(metaName).matches()) { if (log.isTraceEnabled()) { - log.trace("{} is in the excluding list {}", meta.getName(), excludeNames); + log.trace("{} matches the exclude regex {}", metaName, excludeNamesRegex); } return; } diff --git 
a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index baa7c2a107..53e188320d 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -262,6 +262,70 @@ public class RunningRuleTest { Assert.assertEquals(0, runningRule.check().size()); } + @Test + public void testIncludeNamesRegex() { + AlarmRule alarmRule = new AlarmRule(); + alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setMetricsName("endpoint_percent"); + alarmRule.setOp("<"); + alarmRule.setThreshold("1000"); + alarmRule.setCount(1); + alarmRule.setPeriod(10); + alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+"); + + RunningRule runningRule = new RunningRule(alarmRule); + + long timeInPeriod1 = 201808301434L; + long timeInPeriod2 = 201808301436L; + long timeInPeriod3 = 201808301439L; + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 70)); + runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); + + // check at 201808301440 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); + // check at 201808301441 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301446")); + // check at 201808301446 + Assert.assertEquals(0, runningRule.check().size()); + } + + @Test + public void testExcludeNamesRegex() { + AlarmRule alarmRule = new AlarmRule(); + 
alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setMetricsName("endpoint_percent"); + alarmRule.setOp("<"); + alarmRule.setThreshold("1000"); + alarmRule.setCount(1); + alarmRule.setPeriod(10); + alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+"); + + RunningRule runningRule = new RunningRule(alarmRule); + + long timeInPeriod1 = 201808301434L; + long timeInPeriod2 = 201808301436L; + long timeInPeriod3 = 201808301439L; + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 70)); + runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); + + // check at 201808301440 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); + // check at 201808301441 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301446")); + // check at 201808301446 + Assert.assertEquals(0, runningRule.check().size()); + } + private MetaInAlarm getMetaInAlarm(int id) { return new MetaInAlarm() { @Override -- GitLab