未验证 提交 8af10bed 编写于 作者: 静夜思朝颜's avatar 静夜思朝颜 提交者: GitHub

Support regex to include or exclude the names in alarm module (#4775)

* Support regex to include or exclude the names, requested through mail list, https://lists.apache.org/x/thread.html/rb6d91093776cf61880425449d3d25f9d560cd32363c0075de9a85d13@%3Cdev.skywalking.apache.org%3E

Co-authored-by: Mrproliu <mrproliu@lagou.com>
Co-authored-by: kezhenxu94 <kezhenxu94@apache.org>
上级 cd41b223
......@@ -39,6 +39,16 @@ rules:
op: ">"
period: 10
count: 1
service_instance_resp_time_rule:
metrics-name: service_instance_resp_time
op: ">"
threshold: 1000
# [Optional] If not set, all entities of this metrics are matched by default
include-names-regex: instance\_\d+
period: 10
count: 2
silence-period: 5
message: Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes
#webhooks:
# - http://127.0.0.1/notify/
......
......@@ -22,6 +22,8 @@ Alarm rule is constituted by following keys
[List of all potential metrics name](#list-of-all-potential-metrics-name).
- **Include names**. The following entity names are included in this rule. Please follow [Entity name define](#entity-name).
- **Exclude names**. The following entity names are excluded in this rule. Please follow [Entity name define](#entity-name).
- **Include names regex**. Provide a regex to include the entity names. If both the include name list and the include name regex are set, both rules take effect.
- **Exclude names regex**. Provide a regex to exclude the entity names. If both the exclude name list and the exclude name regex are set, both rules take effect.
- **Threshold**. The target value.
For multiple values metrics, such as **percentile**, the threshold is an array. Described like `value1, value2, value3, value4, value5`.
Each value is the threshold for the corresponding value of the metrics. Set a value to `-` if you don't want that value to trigger the alarm.
......@@ -80,7 +82,7 @@ We provided a default `alarm-setting.yml` in our distribution only for convenien
1. Service average response time over 1s in last 3 minutes.
1. Service success rate lower than 80% in last 2 minutes.
1. Percentile of service response time is over 1s in last 3 minutes
1. Service Instance average response time over 1s in last 2 minutes.
1. Service Instance average response time over 1s in last 2 minutes, and the instance name matches the regex.
1. Endpoint average response time over 1s in last 2 minutes.
1. Database access average response time over 1s in last 2 minutes.
1. Endpoint relation average response time over 1s in last 2 minutes.
......
......@@ -38,7 +38,9 @@ public class AlarmRule {
private String metricsName;
private ArrayList<String> includeNames;
private String includeNamesRegex;
private ArrayList<String> excludeNames;
private String excludeNamesRegex;
private String threshold;
private String op;
private int period;
......@@ -61,7 +63,8 @@ public class AlarmRule {
return period == alarmRule.period && count == alarmRule.count && silencePeriod == alarmRule.silencePeriod && Objects
.equals(alarmRuleName, alarmRule.alarmRuleName) && Objects.equals(metricsName, alarmRule.metricsName) && Objects
.equals(includeNames, alarmRule.includeNames) && Objects.equals(excludeNames, alarmRule.excludeNames) && Objects
.equals(threshold, alarmRule.threshold) && Objects.equals(op, alarmRule.op) && Objects.equals(message, alarmRule.message);
.equals(threshold, alarmRule.threshold) && Objects.equals(op, alarmRule.op) && Objects.equals(message, alarmRule.message)
&& Objects.equals(includeNamesRegex, alarmRule.includeNamesRegex) && Objects.equals(excludeNamesRegex, alarmRule.excludeNamesRegex);
}
@Override
......
......@@ -63,6 +63,8 @@ public class RulesReader {
alarmRule.setMetricsName((String) metricsName);
alarmRule.setIncludeNames((ArrayList) settings.getOrDefault("include-names", new ArrayList(0)));
alarmRule.setExcludeNames((ArrayList) settings.getOrDefault("exclude-names", new ArrayList(0)));
alarmRule.setIncludeNamesRegex((String) settings.getOrDefault("include-names-regex", ""));
alarmRule.setExcludeNamesRegex((String) settings.getOrDefault("exclude-names-regex", ""));
alarmRule.setThreshold(settings.get("threshold").toString());
alarmRule.setOp((String) settings.get("op"));
alarmRule.setPeriod((Integer) settings.getOrDefault("period", 1));
......
......@@ -25,9 +25,12 @@ import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
import lombok.RequiredArgsConstructor;
import lombok.ToString;
import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.apm.util.StringUtil;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm;
import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder;
......@@ -59,6 +62,8 @@ public class RunningRule {
private volatile MetricsValueType valueType;
private final List<String> includeNames;
private final List<String> excludeNames;
private final Pattern includeNamesRegex;
private final Pattern excludeNamesRegex;
private final AlarmMessageFormatter formatter;
public RunningRule(AlarmRule alarmRule) {
......@@ -78,6 +83,10 @@ public class RunningRule {
this.includeNames = alarmRule.getIncludeNames();
this.excludeNames = alarmRule.getExcludeNames();
this.includeNamesRegex = StringUtil.isNotEmpty(alarmRule.getIncludeNamesRegex()) ?
Pattern.compile(alarmRule.getIncludeNamesRegex()) : null;
this.excludeNamesRegex = StringUtil.isNotEmpty(alarmRule.getExcludeNamesRegex()) ?
Pattern.compile(alarmRule.getExcludeNamesRegex()) : null;
this.formatter = new AlarmMessageFormatter(alarmRule.getMessage());
}
......@@ -97,19 +106,38 @@ public class RunningRule {
return;
}
final String metaName = meta.getName();
if (CollectionUtils.isNotEmpty(includeNames)) {
if (!includeNames.contains(meta.getName())) {
if (!includeNames.contains(metaName)) {
if (log.isTraceEnabled()) {
log.trace("{} isn't in the including list {}", meta.getName(), includeNames);
log.trace("{} isn't in the including list {}", metaName, includeNames);
}
return;
}
}
if (CollectionUtils.isNotEmpty(excludeNames)) {
if (excludeNames.contains(meta.getName())) {
if (excludeNames.contains(metaName)) {
if (log.isTraceEnabled()) {
log.trace("{} is in the excluding list {}", metaName, excludeNames);
}
return;
}
}
if (includeNamesRegex != null) {
if (!includeNamesRegex.matcher(metaName).matches()) {
if (log.isTraceEnabled()) {
log.trace("{} doesn't match the include regex {}", metaName, includeNamesRegex);
}
return;
}
}
if (excludeNamesRegex != null) {
if (excludeNamesRegex.matcher(metaName).matches()) {
if (log.isTraceEnabled()) {
log.trace("{} is in the excluding list {}", meta.getName(), excludeNames);
log.trace("{} matches the exclude regex {}", metaName, excludeNamesRegex);
}
return;
}
......
......@@ -262,6 +262,70 @@ public class RunningRuleTest {
Assert.assertEquals(0, runningRule.check().size());
}
/**
 * Exercises a rule that is restricted only by an include-names regex.
 * Metrics are fed for meta ids 123 and 223; how an id maps to an entity name is decided by
 * getMetaInAlarm (presumably "Service_" + id, so only 123 would match the regex - TODO confirm
 * against that helper).
 */
@Test
public void testIncludeNamesRegex() {
    final AlarmRule rule = new AlarmRule();
    rule.setAlarmRuleName("endpoint_percent_rule");
    rule.setMetricsName("endpoint_percent");
    rule.setThreshold("1000");
    rule.setOp("<");
    rule.setPeriod(10);
    rule.setCount(1);
    rule.setIncludeNamesRegex("Service\\_1(\\d)+");
    rule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");

    final RunningRule running = new RunningRule(rule);

    // Two samples for meta 123 and one sample for meta 223, at three different time buckets.
    running.in(getMetaInAlarm(123), getMetrics(201808301434L, 70));
    running.in(getMetaInAlarm(123), getMetrics(201808301436L, 70));
    running.in(getMetaInAlarm(223), getMetrics(201808301439L, 74));

    // First check, before the rule is moved to any later time bucket: one alarm expected.
    final int alarmsAtStart = running.check().size();
    Assert.assertEquals(1, alarmsAtStart);

    // After moving to 201808301441 one alarm is still expected.
    running.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
    final int alarmsAfterFirstMove = running.check().size();
    Assert.assertEquals(1, alarmsAfterFirstMove);

    // After moving to 201808301446 no alarm is expected any more.
    running.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301446"));
    final int alarmsAfterSecondMove = running.check().size();
    Assert.assertEquals(0, alarmsAfterSecondMove);
}
/**
 * Exercises a rule that is restricted only by an exclude-names regex.
 * Metrics are fed for meta ids 123 and 223; how an id maps to an entity name is decided by
 * getMetaInAlarm (presumably "Service_" + id, so only 223 would match the exclusion - TODO confirm
 * against that helper).
 */
@Test
public void testExcludeNamesRegex() {
    final AlarmRule rule = new AlarmRule();
    rule.setAlarmRuleName("endpoint_percent_rule");
    rule.setMetricsName("endpoint_percent");
    rule.setThreshold("1000");
    rule.setOp("<");
    rule.setPeriod(10);
    rule.setCount(1);
    rule.setExcludeNamesRegex("Service\\_2(\\d)+");
    rule.setMessage("Response time of service instance {name} is more than 1000ms in 2 minutes of last 10 minutes");

    final RunningRule running = new RunningRule(rule);

    // Two samples for meta 123 and one sample for meta 223, at three different time buckets.
    running.in(getMetaInAlarm(123), getMetrics(201808301434L, 70));
    running.in(getMetaInAlarm(123), getMetrics(201808301436L, 70));
    running.in(getMetaInAlarm(223), getMetrics(201808301439L, 74));

    // First check, before the rule is moved to any later time bucket: one alarm expected.
    final int alarmsAtStart = running.check().size();
    Assert.assertEquals(1, alarmsAtStart);

    // After moving to 201808301441 one alarm is still expected.
    running.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
    final int alarmsAfterFirstMove = running.check().size();
    Assert.assertEquals(1, alarmsAfterFirstMove);

    // After moving to 201808301446 no alarm is expected any more.
    running.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301446"));
    final int alarmsAfterSecondMove = running.check().size();
    Assert.assertEquals(0, alarmsAfterSecondMove);
}
private MetaInAlarm getMetaInAlarm(int id) {
return new MetaInAlarm() {
@Override
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册