未验证 提交 19836e64 编写于 作者: G Gao Hongtao 提交者: GitHub

Add labeled metrics to alarm system (#5276)

上级 4e606902
......@@ -24,6 +24,14 @@ Alarm rule is constituted by following keys
- **Exclude names**. The following entity names are excluded in this rule. Please follow [Entity name define](#entity-name).
- **Include names regex**. Provide a regex to include the entity names. If both the include name list and the include name regex are set, both rules take effect.
- **Exclude names regex**. Provide a regex to exclude the entity names. If both the exclude name list and the exclude name regex are set, both rules take effect.
- **Include labels**. The following labels of the metric are included in this rule.
- **Exclude labels**. The following labels of the metric are excluded in this rule.
- **Include labels regex**. Provide a regex to include labels. If both the include label list and the include label regex are set, both rules take effect.
- **Exclude labels regex**. Provide a regex to exclude labels. If both the exclude label list and the exclude label regex are set, both rules take effect.
*The label settings are required by the meter-system, which is intended to store metrics from label-system platforms, such as Prometheus, Micrometer, etc.
A function that supports the above four settings should implement `LabeledValueHolder`.*
- **Threshold**. The target value.
For multiple values metrics, such as **percentile**, the threshold is an array. Described like `value1, value2, value3, value4, value5`.
Each value serves as the threshold for the corresponding value of the metrics. Set a value to `-` if you don't want the alarm to be triggered by that value; this can be done for one or several of the values.
......@@ -75,6 +83,16 @@ rules:
count: 3
silence-period: 5
message: Percentile response time of service {name} alarm in 3 minutes of last 10 minutes, due to more than one condition of p50 > 1000, p75 > 1000, p90 > 1000, p95 > 1000, p99 > 1000
meter_service_status_code_rule:
metrics-name: meter_status_code
exclude-labels:
- "200"
op: ">"
threshold: 10
period: 10
count: 3
silence-period: 5
message: The request number of entity {name} non-200 status is more than expected.
```
### Default alarm rules
......
......@@ -41,6 +41,10 @@ public class AlarmRule {
private String includeNamesRegex;
private ArrayList<String> excludeNames;
private String excludeNamesRegex;
private ArrayList<String> includeLabels;
private String includeLabelsRegex;
private ArrayList<String> excludeLabels;
private String excludeLabelsRegex;
private String threshold;
private String op;
private int period;
......
......@@ -19,5 +19,5 @@
package org.apache.skywalking.oap.server.core.alarm.provider;
public enum MetricsValueType {
LONG, INT, DOUBLE, MULTI_INTS
LONG, INT, DOUBLE, LABELED_LONG, MULTI_INTS
}
......@@ -65,6 +65,10 @@ public class RulesReader {
alarmRule.setExcludeNames((ArrayList) settings.getOrDefault("exclude-names", new ArrayList(0)));
alarmRule.setIncludeNamesRegex((String) settings.getOrDefault("include-names-regex", ""));
alarmRule.setExcludeNamesRegex((String) settings.getOrDefault("exclude-names-regex", ""));
alarmRule.setIncludeLabels((ArrayList) settings.getOrDefault("include-labels", new ArrayList(0)));
alarmRule.setExcludeLabels((ArrayList) settings.getOrDefault("exclude-labels", new ArrayList(0)));
alarmRule.setIncludeLabelsRegex((String) settings.getOrDefault("include-labels-regex", ""));
alarmRule.setExcludeLabelsRegex((String) settings.getOrDefault("exclude-labels-regex", ""));
alarmRule.setThreshold(settings.get("threshold").toString());
alarmRule.setOp((String) settings.get("op"));
alarmRule.setPeriod((Integer) settings.getOrDefault("period", 1));
......
......@@ -20,6 +20,7 @@ package org.apache.skywalking.oap.server.core.alarm.provider;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
......@@ -33,8 +34,10 @@ import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.apm.util.StringUtil;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm;
import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable;
import org.apache.skywalking.oap.server.core.analysis.metrics.DoubleValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.IntValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics;
import org.apache.skywalking.oap.server.core.analysis.metrics.MultiIntValuesHolder;
......@@ -64,6 +67,10 @@ public class RunningRule {
private final List<String> excludeNames;
private final Pattern includeNamesRegex;
private final Pattern excludeNamesRegex;
private final List<String> includeLabels;
private final List<String> excludeLabels;
private final Pattern includeLabelsRegex;
private final Pattern excludeLabelsRegex;
private final AlarmMessageFormatter formatter;
public RunningRule(AlarmRule alarmRule) {
......@@ -87,6 +94,12 @@ public class RunningRule {
Pattern.compile(alarmRule.getIncludeNamesRegex()) : null;
this.excludeNamesRegex = StringUtil.isNotEmpty(alarmRule.getExcludeNamesRegex()) ?
Pattern.compile(alarmRule.getExcludeNamesRegex()) : null;
this.includeLabels = alarmRule.getIncludeLabels();
this.excludeLabels = alarmRule.getExcludeLabels();
this.includeLabelsRegex = StringUtil.isNotEmpty(alarmRule.getIncludeLabelsRegex()) ?
Pattern.compile(alarmRule.getIncludeLabelsRegex()) : null;
this.excludeLabelsRegex = StringUtil.isNotEmpty(alarmRule.getExcludeLabelsRegex()) ?
Pattern.compile(alarmRule.getExcludeLabelsRegex()) : null;
this.formatter = new AlarmMessageFormatter(alarmRule.getMessage());
}
......@@ -107,40 +120,8 @@ public class RunningRule {
}
final String metaName = meta.getName();
if (CollectionUtils.isNotEmpty(includeNames)) {
if (!includeNames.contains(metaName)) {
if (log.isTraceEnabled()) {
log.trace("{} isn't in the including list {}", metaName, includeNames);
}
return;
}
}
if (CollectionUtils.isNotEmpty(excludeNames)) {
if (excludeNames.contains(metaName)) {
if (log.isTraceEnabled()) {
log.trace("{} is in the excluding list {}", metaName, excludeNames);
}
return;
}
}
if (includeNamesRegex != null) {
if (!includeNamesRegex.matcher(metaName).matches()) {
if (log.isTraceEnabled()) {
log.trace("{} doesn't match the include regex {}", metaName, includeNamesRegex);
}
return;
}
}
if (excludeNamesRegex != null) {
if (excludeNamesRegex.matcher(metaName).matches()) {
if (log.isTraceEnabled()) {
log.trace("{} matches the exclude regex {}", metaName, excludeNamesRegex);
}
return;
}
if (!validate(metaName, includeNames, excludeNames, includeNamesRegex, excludeNamesRegex)) {
return;
}
if (valueType == null) {
......@@ -156,6 +137,18 @@ public class RunningRule {
} else if (metrics instanceof MultiIntValuesHolder) {
valueType = MetricsValueType.MULTI_INTS;
threshold.setType(MetricsValueType.MULTI_INTS);
} else if (metrics instanceof LabeledValueHolder) {
if (((LabeledValueHolder) metrics).getValue().keys().stream()
.noneMatch(label -> validate(
label,
includeLabels,
excludeLabels,
includeLabelsRegex,
excludeLabelsRegex))) {
return;
}
valueType = MetricsValueType.LABELED_LONG;
threshold.setType(MetricsValueType.LONG);
} else {
log.warn("Unsupported value type {}", valueType);
return;
......@@ -168,6 +161,50 @@ public class RunningRule {
}
}
/**
 * Decide whether a target string (an entity name or a metric label) passes the configured filters.
 *
 * The filters are applied in a fixed order: include list, exclude list, include regular expression,
 * exclude regular expression. A list filter is skipped when the list is empty, a regex filter is
 * skipped when the pattern is {@code null}. The first filter that rejects the target ends the check.
 *
 * @param target       the string to check
 * @param includeList  when not empty, the target must be contained in it
 * @param excludeList  when not empty, the target must not be contained in it
 * @param includeRegex when not null, the whole target must match it
 * @param excludeRegex when not null, the whole target must not match it
 * @return {@code true} if no filter rejects the target, {@code false} otherwise
 */
private boolean validate(String target, List<String> includeList, List<String> excludeList,
                         Pattern includeRegex, Pattern excludeRegex) {
    // Guard 1: an explicit include list acts as a whitelist.
    if (CollectionUtils.isNotEmpty(includeList) && !includeList.contains(target)) {
        if (log.isTraceEnabled()) {
            log.trace("{} isn't in the including list {}", target, includeList);
        }
        return false;
    }

    // Guard 2: an explicit exclude list acts as a blacklist.
    if (CollectionUtils.isNotEmpty(excludeList) && excludeList.contains(target)) {
        if (log.isTraceEnabled()) {
            log.trace("{} is in the excluding list {}", target, excludeList);
        }
        return false;
    }

    // Guard 3: the include pattern has to match the entire target.
    if (includeRegex != null && !includeRegex.matcher(target).matches()) {
        if (log.isTraceEnabled()) {
            log.trace("{} doesn't match the include regex {}", target, includeRegex);
        }
        return false;
    }

    // Guard 4: a full match of the exclude pattern rejects the target.
    if (excludeRegex != null && excludeRegex.matcher(target).matches()) {
        if (log.isTraceEnabled()) {
            log.trace("{} matches the exclude regex {}", target, excludeRegex);
        }
        return false;
    }

    return true;
}
/**
* Move the buffer window to give time.
*
......@@ -365,6 +402,20 @@ public class RunningRule {
}
}
break;
case LABELED_LONG:
DataTable values = ((LabeledValueHolder) metrics).getValue();
lexpected = RunningRule.this.threshold.getLongThreshold();
if (values.keys().stream().anyMatch(label ->
validate(
label,
RunningRule.this.includeLabels,
RunningRule.this.excludeLabels,
RunningRule.this.includeLabelsRegex,
RunningRule.this.excludeLabelsRegex)
&& op.test(lexpected, values.get(label)))) {
matchCount++;
}
break;
}
}
......@@ -404,6 +455,11 @@ public class RunningRule {
int[] iArr = ((MultiIntValuesHolder) m).getValues();
r.add(new TraceLogMetric(m.getTimeBucket(), Arrays.stream(iArr).boxed().toArray(Number[]::new)));
break;
case LABELED_LONG:
DataTable dt = ((LabeledValueHolder) m).getValue();
TraceLogMetric l = new TraceLogMetric(m.getTimeBucket(), dt.sortedValues(Comparator.naturalOrder()).toArray(new Number[0]));
l.labels = dt.sortedKeys(Comparator.naturalOrder()).toArray(new String[0]);
r.add(l);
}
});
return r;
......@@ -414,5 +470,6 @@ public class RunningRule {
private static class TraceLogMetric {
private final long timeBucket;
private final Number[] value;
private String[] labels;
}
}
......@@ -23,11 +23,15 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.Const;
import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm;
import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable;
import org.apache.skywalking.oap.server.core.analysis.metrics.IntValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics;
import org.apache.skywalking.oap.server.core.analysis.metrics.MultiIntValuesHolder;
import org.apache.skywalking.oap.server.core.remote.grpc.proto.RemoteData;
......@@ -149,6 +153,22 @@ public class RunningRuleTest {
.getAlarmMessage());
}
/**
 * Runs the shared labeled-metrics scenario once per label filter kind: include list,
 * include regex, exclude list and exclude regex.
 */
@Test
public void testLabeledAlarm() {
    // Filter by an explicit list of included labels.
    AlarmRule includeListRule = new AlarmRule();
    includeListRule.setIncludeLabels(Lists.newArrayList("95", "99"));
    assertLabeled(includeListRule);

    // Filter by a regular expression of included labels.
    AlarmRule includeRegexRule = new AlarmRule();
    includeRegexRule.setIncludeLabelsRegex("9\\d{1}");
    assertLabeled(includeRegexRule);

    // Filter by an explicit list of excluded labels.
    AlarmRule excludeListRule = new AlarmRule();
    excludeListRule.setExcludeLabels(Lists.newArrayList("50", "75"));
    assertLabeled(excludeListRule);

    // Filter by a regular expression of excluded labels.
    AlarmRule excludeRegexRule = new AlarmRule();
    excludeRegexRule.setExcludeLabelsRegex("^[5-7][0-9]$");
    assertLabeled(excludeRegexRule);
}
@Test
public void testNoAlarm() {
AlarmRule alarmRule = new AlarmRule();
......@@ -386,6 +406,13 @@ public class RunningRuleTest {
}
/**
 * Create a mock labeled metrics instance for the given time bucket.
 *
 * @param timeBucket the time bucket assigned to the metrics
 * @param values     the text handed to the {@link DataTable} constructor
 * @return the populated mock metrics
 */
private Metrics getLabeledValueMetrics(long timeBucket, String values) {
    final DataTable table = new DataTable(values);
    final MockLabeledValueMetrics labeled = new MockLabeledValueMetrics();
    labeled.setValue(table);
    labeled.setTimeBucket(timeBucket);
    return labeled;
}
private class MockMetrics extends Metrics implements IntValueHolder {
private int value;
......@@ -491,4 +518,88 @@ public class RunningRuleTest {
return null;
}
}
/**
 * Test stub of a labeled metrics: it only carries a {@link DataTable} value, every other
 * {@link Metrics} operation is an empty implementation or returns a placeholder.
 */
private class MockLabeledValueMetrics extends Metrics implements LabeledValueHolder {
    // Lombok generates getValue()/setValue(DataTable); getValue() fulfils LabeledValueHolder.
    @Getter
    @Setter
    private DataTable value;

    // No identity in this stub.
    @Override
    public String id() {
        return null;
    }

    // Intentionally empty.
    @Override
    public void combine(Metrics metrics) {
    }

    // Intentionally empty.
    @Override
    public void calculate() {
    }

    // No hour-level conversion in this stub.
    @Override
    public Metrics toHour() {
        return null;
    }

    // No day-level conversion in this stub.
    @Override
    public Metrics toDay() {
        return null;
    }

    // Constant placeholder.
    @Override
    public int remoteHashCode() {
        return 0;
    }

    // Intentionally empty.
    @Override
    public void deserialize(RemoteData remoteData) {
    }

    // No serialized form in this stub.
    @Override
    public RemoteData.Builder serialize() {
        return null;
    }
}
/**
 * Shared scenario for labeled-metrics alarms.
 *
 * Completes the given rule (the caller only configures the label filter), feeds three labeled
 * metrics into a fresh {@link RunningRule} and verifies that the first two checks raise no alarm
 * while the third check raises exactly one alarm with the expected message.
 *
 * @param alarmRule a rule that already carries one of the label filter settings
 */
private void assertLabeled(AlarmRule alarmRule) {
    alarmRule.setAlarmRuleName("endpoint_percent_alarm_rule");
    alarmRule.setMetricsName("endpoint_percent");
    alarmRule.setOp(">");
    alarmRule.setThreshold("10");
    alarmRule.setCount(3);
    alarmRule.setPeriod(15);
    alarmRule.setMessage("response percentile of endpoint {name} is lower than expected value");

    RunningRule runningRule = new RunningRule(alarmRule);

    long timeInPeriod1 = 201808301434L;
    long timeInPeriod2 = 201808301436L;
    long timeInPeriod3 = 201808301438L;

    // Each data table is written as "label,value|label,value".
    runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod1, "50,17|99,11"));
    runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod2, "75,15|95,12"));
    runningRule.in(getMetaInAlarm(123), getLabeledValueMetrics(timeInPeriod3, "90,1|99,20"));

    // check at 201808301440
    List<AlarmMessage> alarmMessages = runningRule.check();
    Assert.assertEquals(0, alarmMessages.size());
    runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
    // check at 201808301441
    alarmMessages = runningRule.check();
    Assert.assertEquals(0, alarmMessages.size());
    runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
    // check at 201808301442
    alarmMessages = runningRule.check();
    Assert.assertEquals(1, alarmMessages.size());
    Assert.assertEquals(
        "response percentile of endpoint Service_123 is lower than expected value", alarmMessages.get(0)
                                                                                                 .getAlarmMessage());
}
}
......@@ -28,6 +28,7 @@ import org.apache.skywalking.oap.server.core.Const;
import org.apache.skywalking.oap.server.core.UnexpectedException;
import org.apache.skywalking.oap.server.core.analysis.manual.instance.InstanceTraffic;
import org.apache.skywalking.oap.server.core.analysis.meter.MeterEntity;
import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics;
import org.apache.skywalking.oap.server.core.analysis.metrics.annotation.ConstOne;
import org.apache.skywalking.oap.server.core.analysis.metrics.annotation.Entrance;
......@@ -39,7 +40,7 @@ import org.apache.skywalking.oap.server.core.storage.annotation.Column;
@MeterFunction(functionName = "avg")
@ToString
public abstract class AvgFunction extends Metrics implements AcceptableValue<Long> {
public abstract class AvgFunction extends Metrics implements AcceptableValue<Long>, LongValueHolder {
protected static final String SUMMATION = "summation";
protected static final String COUNT = "count";
protected static final String VALUE = "value";
......
......@@ -30,6 +30,7 @@ import org.apache.skywalking.oap.server.core.UnexpectedException;
import org.apache.skywalking.oap.server.core.analysis.manual.instance.InstanceTraffic;
import org.apache.skywalking.oap.server.core.analysis.meter.MeterEntity;
import org.apache.skywalking.oap.server.core.analysis.metrics.DataTable;
import org.apache.skywalking.oap.server.core.analysis.metrics.LabeledValueHolder;
import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics;
import org.apache.skywalking.oap.server.core.remote.grpc.proto.RemoteData;
import org.apache.skywalking.oap.server.core.storage.StorageBuilder;
......@@ -37,7 +38,7 @@ import org.apache.skywalking.oap.server.core.storage.annotation.Column;
@MeterFunction(functionName = "avgLabeled")
@ToString
public abstract class AvgLabeledFunction extends Metrics implements AcceptableValue<DataTable> {
public abstract class AvgLabeledFunction extends Metrics implements AcceptableValue<DataTable>, LabeledValueHolder {
protected static final String SUMMATION = "summation";
protected static final String COUNT = "count";
protected static final String VALUE = "value";
......
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.skywalking.oap.server.core.analysis.metrics;
/**
 * LabeledValueHolder holds a list of key-value pairs, exposed as a {@link DataTable}.
 */
public interface LabeledValueHolder {
    /**
     * @return the table of key-value pairs held by this instance
     */
    DataTable getValue();
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册