diff --git a/dist-material/alarm-settings-sample.yml b/dist-material/alarm-settings-sample.yml index e332587a32ab8825896e16314b4f68c2f5328e79..c5f7d551ec8313a3d14df545dcb8ce9ceb13dd49 100644 --- a/dist-material/alarm-settings-sample.yml +++ b/dist-material/alarm-settings-sample.yml @@ -39,6 +39,16 @@ rules: op: ">" period: 10 count: 1 + service_instance_resp_time_rule: + metrics-name: service_instance_resp_time + op: ">" + threshold: 1000 + # [Optional] Default, match all services in this metrics + include-names-regex: instance\_\d+ + period: 10 + count: 2 + silence-period: 5 + message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes #webhooks: # - http://127.0.0.1/notify/ diff --git a/docs/en/setup/backend/backend-alarm.md b/docs/en/setup/backend/backend-alarm.md index c00817bf7975f7ee6826163ae49783854bdf4773..387b1935f67e640725d0c4bc1f44bbfb26eb96de 100644 --- a/docs/en/setup/backend/backend-alarm.md +++ b/docs/en/setup/backend/backend-alarm.md @@ -22,6 +22,8 @@ Alarm rule is constituted by following keys [List of all potential metrics name](#list-of-all-potential-metrics-name). - **Include names**. The following entity names are included in this rule. Please follow [Entity name define](#entity-name). - **Exclude names**. The following entity names are excluded in this rule. Please follow [Entity name define](#entity-name). +- **Include names regex**. Provide a regex to include the entity names. If both the include name list and the include name regex are set, both rules take effect. +- **Exclude names regex**. Provide a regex to exclude the entity names. If both the exclude name list and the exclude name regex are set, both rules take effect. - **Threshold**. The target value. For multiple values metrics, such as **percentile**, the threshold is an array. Described like `value1, value2, value3, value4, value5`. Each value could the threshold for each value of the metrics. 
Set the value to `-` if don't want to trigger alarm by this or some of the values. @@ -80,7 +82,7 @@ We provided a default `alarm-setting.yml` in our distribution only for convenien 1. Service average response time over 1s in last 3 minutes. 1. Service success rate lower than 80% in last 2 minutes. 1. Percentile of service response time is over 1s in last 3 minutes -1. Service Instance average response time over 1s in last 2 minutes. +1. Service Instance average response time over 1s in last 2 minutes, and the instance name matches the regex. 1. Endpoint average response time over 1s in last 2 minutes. 1. Database access average response time over 1s in last 2 minutes. 1. Endpoint relation average response time over 1s in last 2 minutes. diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java index ef6672b0baa260b6008f90b4e1b9eade3dbdb0d2..f895972113b4eeef24796ad418e434c2fcaa3e30 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/AlarmRule.java @@ -38,7 +38,9 @@ public class AlarmRule { private String metricsName; private ArrayList includeNames; + private String includeNamesRegex; private ArrayList excludeNames; + private String excludeNamesRegex; private String threshold; private String op; private int period; @@ -61,7 +63,8 @@ public class AlarmRule { return period == alarmRule.period && count == alarmRule.count && silencePeriod == alarmRule.silencePeriod && Objects .equals(alarmRuleName, alarmRule.alarmRuleName) && Objects.equals(metricsName, alarmRule.metricsName) && Objects .equals(includeNames, alarmRule.includeNames) && Objects.equals(excludeNames, alarmRule.excludeNames) && Objects - 
.equals(threshold, alarmRule.threshold) && Objects.equals(op, alarmRule.op) && Objects.equals(message, alarmRule.message); + .equals(threshold, alarmRule.threshold) && Objects.equals(op, alarmRule.op) && Objects.equals(message, alarmRule.message) + && Objects.equals(includeNamesRegex, alarmRule.includeNamesRegex) && Objects.equals(excludeNamesRegex, alarmRule.excludeNamesRegex); } @Override diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java index de93319b31e78ad90cd4f25b17214dd50dcc787b..371f6c32a45eb36f9b8500fbc2c9f2f4dfe4f0ea 100644 --- a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RulesReader.java @@ -63,6 +63,8 @@ public class RulesReader { alarmRule.setMetricsName((String) metricsName); alarmRule.setIncludeNames((ArrayList) settings.getOrDefault("include-names", new ArrayList(0))); alarmRule.setExcludeNames((ArrayList) settings.getOrDefault("exclude-names", new ArrayList(0))); + alarmRule.setIncludeNamesRegex((String) settings.getOrDefault("include-names-regex", "")); + alarmRule.setExcludeNamesRegex((String) settings.getOrDefault("exclude-names-regex", "")); alarmRule.setThreshold(settings.get("threshold").toString()); alarmRule.setOp((String) settings.get("op")); alarmRule.setPeriod((Integer) settings.getOrDefault("period", 1)); diff --git a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java index 5310abd75cd4cfd0ed82859ee432c8c2fb80518e..00d61469fbcb1e37e07a351cf722fea29854df4c 100644 --- 
a/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java +++ b/oap-server/server-alarm-plugin/src/main/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRule.java @@ -25,9 +25,12 @@ import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.locks.ReentrantLock; +import java.util.regex.Pattern; + import lombok.RequiredArgsConstructor; import lombok.ToString; import lombok.extern.slf4j.Slf4j; +import org.apache.skywalking.apm.util.StringUtil; import org.apache.skywalking.oap.server.core.alarm.AlarmMessage; import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm; import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder; @@ -59,6 +62,8 @@ public class RunningRule { private volatile MetricsValueType valueType; private final List includeNames; private final List excludeNames; + private final Pattern includeNamesRegex; + private final Pattern excludeNamesRegex; private final AlarmMessageFormatter formatter; public RunningRule(AlarmRule alarmRule) { @@ -78,6 +83,10 @@ public class RunningRule { this.includeNames = alarmRule.getIncludeNames(); this.excludeNames = alarmRule.getExcludeNames(); + this.includeNamesRegex = StringUtil.isNotEmpty(alarmRule.getIncludeNamesRegex()) ? + Pattern.compile(alarmRule.getIncludeNamesRegex()) : null; + this.excludeNamesRegex = StringUtil.isNotEmpty(alarmRule.getExcludeNamesRegex()) ? 
+ Pattern.compile(alarmRule.getExcludeNamesRegex()) : null; this.formatter = new AlarmMessageFormatter(alarmRule.getMessage()); } @@ -97,19 +106,38 @@ public class RunningRule { return; } + final String metaName = meta.getName(); if (CollectionUtils.isNotEmpty(includeNames)) { - if (!includeNames.contains(meta.getName())) { + if (!includeNames.contains(metaName)) { if (log.isTraceEnabled()) { - log.trace("{} isn't in the including list {}", meta.getName(), includeNames); + log.trace("{} isn't in the including list {}", metaName, includeNames); } return; } } if (CollectionUtils.isNotEmpty(excludeNames)) { - if (excludeNames.contains(meta.getName())) { + if (excludeNames.contains(metaName)) { + if (log.isTraceEnabled()) { + log.trace("{} is in the excluding list {}", metaName, excludeNames); + } + return; + } + } + + if (includeNamesRegex != null) { + if (!includeNamesRegex.matcher(metaName).matches()) { + if (log.isTraceEnabled()) { + log.trace("{} doesn't match the include regex {}", metaName, includeNamesRegex); + } + return; + } + } + + if (excludeNamesRegex != null) { + if (excludeNamesRegex.matcher(metaName).matches()) { if (log.isTraceEnabled()) { - log.trace("{} is in the excluding list {}", meta.getName(), excludeNames); + log.trace("{} matches the exclude regex {}", metaName, excludeNamesRegex); } return; } diff --git a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java index baa7c2a1072642587d9ba73c6b981b5cf6e1bf9a..53e188320dadf924d1810050d98472b3b72018e1 100644 --- a/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java +++ b/oap-server/server-alarm-plugin/src/test/java/org/apache/skywalking/oap/server/core/alarm/provider/RunningRuleTest.java @@ -262,6 +262,70 @@ public class RunningRuleTest { 
Assert.assertEquals(0, runningRule.check().size()); } + @Test + public void testIncludeNamesRegex() { + AlarmRule alarmRule = new AlarmRule(); + alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setMetricsName("endpoint_percent"); + alarmRule.setOp("<"); + alarmRule.setThreshold("1000"); + alarmRule.setCount(1); + alarmRule.setPeriod(10); + alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + alarmRule.setIncludeNamesRegex("Service\\_1(\\d)+"); + + RunningRule runningRule = new RunningRule(alarmRule); + + long timeInPeriod1 = 201808301434L; + long timeInPeriod2 = 201808301436L; + long timeInPeriod3 = 201808301439L; + + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 70)); + runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); + + // check at 201808301440 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); + // check at 201808301441 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301446")); + // check at 201808301446 + Assert.assertEquals(0, runningRule.check().size()); + } + + @Test + public void testExcludeNamesRegex() { + AlarmRule alarmRule = new AlarmRule(); + alarmRule.setAlarmRuleName("endpoint_percent_rule"); + alarmRule.setMetricsName("endpoint_percent"); + alarmRule.setOp("<"); + alarmRule.setThreshold("1000"); + alarmRule.setCount(1); + alarmRule.setPeriod(10); + alarmRule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes"); + alarmRule.setExcludeNamesRegex("Service\\_2(\\d)+"); + + RunningRule runningRule = new RunningRule(alarmRule); + + long timeInPeriod1 = 201808301434L; + long timeInPeriod2 = 201808301436L; + long timeInPeriod3 = 201808301439L; + + 
runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod1, 70)); + runningRule.in(getMetaInAlarm(123), getMetrics(timeInPeriod2, 70)); + runningRule.in(getMetaInAlarm(223), getMetrics(timeInPeriod3, 74)); + + // check at 201808301440 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441")); + // check at 201808301441 + Assert.assertEquals(1, runningRule.check().size()); + runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301446")); + // check at 201808301446 + Assert.assertEquals(0, runningRule.check().size()); + } + private MetaInAlarm getMetaInAlarm(int id) { return new MetaInAlarm() { @Override