提交 fff74012 编写于 作者: wu-sheng's avatar wu-sheng

Fix alarm issues.

上级 4559c49e
......@@ -28,6 +28,8 @@ import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.joda.time.LocalDateTime;
import org.joda.time.Minutes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Alarm core includes metric values in certain time windows based on alarm settings. By using its internal timer
......@@ -36,6 +38,8 @@ import org.joda.time.Minutes;
* @author wusheng
*/
public class AlarmCore {
private static final Logger logger = LoggerFactory.getLogger(AlarmCore.class);
private Map<String, List<RunningRule>> runningContext;
private LocalDateTime lastExecuteTime;
......@@ -62,29 +66,31 @@ public class AlarmCore {
public void start(List<AlarmCallback> allCallbacks) {
LocalDateTime now = LocalDateTime.now();
lastExecuteTime = now;
runningContext.values().forEach(ruleList -> ruleList.forEach(runningRule -> runningRule.start(now)));
Executors.newSingleThreadScheduledExecutor().scheduleAtFixedRate(() -> {
List<AlarmMessage> alarmMessageList = new ArrayList<>(30);
runningContext.values().forEach(ruleList -> ruleList.forEach(runningRule -> {
LocalDateTime checkTime = LocalDateTime.now();
int minutes = Minutes.minutesBetween(lastExecuteTime, checkTime).getMinutes();
if (minutes > 0) {
runningRule.moveTo(checkTime);
/**
* Don't run in the first quarter per min, avoid to trigger alarm.
*/
if (checkTime.getSecondOfMinute() > 15) {
AlarmMessage alarmMessage = runningRule.check();
if (alarmMessage != AlarmMessage.NONE) {
alarmMessageList.add(alarmMessage);
try {
List<AlarmMessage> alarmMessageList = new ArrayList<>(30);
runningContext.values().forEach(ruleList -> ruleList.forEach(runningRule -> {
LocalDateTime checkTime = LocalDateTime.now();
int minutes = Minutes.minutesBetween(lastExecuteTime, checkTime).getMinutes();
if (minutes > 0) {
runningRule.moveTo(checkTime);
/**
* Don't run in the first quarter per min, avoid to trigger false alarm.
*/
if (checkTime.getSecondOfMinute() > 15) {
alarmMessageList.addAll(runningRule.check());
// Set the last execute time, and make sure the second is `00`, such as: 18:30:00
lastExecuteTime = checkTime.minusSeconds(checkTime.getSecondOfMinute());
}
// Set the last execute time, and make sure the second is `00`, such as: 18:30:00
lastExecuteTime = checkTime.minusSeconds(checkTime.getSecondOfMinute());
}
}
}));
}));
allCallbacks.forEach(callback -> callback.doAlarm(alarmMessageList));
if (alarmMessageList.size() > 0) {
allCallbacks.forEach(callback -> callback.doAlarm(alarmMessageList));
}
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
}, 10, 10, TimeUnit.SECONDS);
}
}
......@@ -18,6 +18,7 @@
package org.apache.skywalking.oap.server.core.alarm.provider;
import java.util.ArrayList;
import lombok.AccessLevel;
import lombok.Getter;
import lombok.Setter;
......@@ -31,6 +32,7 @@ public class AlarmRule {
private String alarmRuleName;
private String indicatorName;
private ArrayList includeNames;
private String threshold;
private String op;
private int period;
......
......@@ -58,7 +58,9 @@ public class RulesReader {
if (indicatorName == null) {
throw new IllegalArgumentException("indicator-name can't be null");
}
alarmRule.setIndicatorName((String)indicatorName);
alarmRule.setIncludeNames((ArrayList)settings.getOrDefault("include-names", new ArrayList(0)));
alarmRule.setThreshold(settings.get("threshold").toString());
alarmRule.setOp((String)settings.get("op"));
alarmRule.setPeriod((Integer)settings.getOrDefault("period", 1));
......
......@@ -18,10 +18,12 @@
package org.apache.skywalking.oap.server.core.alarm.provider;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm;
import org.apache.skywalking.oap.server.core.analysis.indicator.DoubleValueHolder;
......@@ -46,22 +48,27 @@ public class RunningRule {
private static DateTimeFormatter TIME_BUCKET_FORMATTER = DateTimeFormat.forPattern("yyyyMMddHHmm");
private String ruleName;
private int period;
private String indicatorName;
private final Threshold threshold;
private final OP op;
private final int countThreshold;
private final int silencePeriod;
private int counter;
private int silenceCountdown;
private Window window;
private volatile boolean isStarted = false;
private Map<MetaInAlarm, Window> windows;
private volatile IndicatorValueType valueType;
private Scope targetScope;
private List<String> includeNames;
public RunningRule(AlarmRule alarmRule) {
indicatorName = alarmRule.getIndicatorName();
this.ruleName = alarmRule.getAlarmRuleName();
// Init the empty window for alarming rule.
window = new Window(alarmRule.getPeriod());
windows = new ConcurrentHashMap<>();
period = alarmRule.getPeriod();
threshold = new Threshold(alarmRule.getAlarmRuleName(), alarmRule.getThreshold());
op = OP.get(alarmRule.getOp());
......@@ -70,6 +77,8 @@ public class RunningRule {
this.silencePeriod = alarmRule.getSilencePeriod();
// -1 means silence countdown is not running.
silenceCountdown = -1;
this.includeNames = alarmRule.getIncludeNames();
}
/**
......@@ -79,7 +88,8 @@ public class RunningRule {
* @param indicator
*/
public void in(MetaInAlarm meta, Indicator indicator) {
if (!isStarted) {
if (!meta.getIndicatorName().equals(indicatorName)) {
//Don't match rule, exit.
return;
}
......@@ -98,34 +108,44 @@ public class RunningRule {
}
if (valueType != null) {
Window window = windows.get(meta);
if (window == null) {
window = new Window(period);
Window ifAbsent = windows.putIfAbsent(meta, window);
if (ifAbsent == null) {
LocalDateTime timebucket = TIME_BUCKET_FORMATTER.parseLocalDateTime(indicator.getTimeBucket() + "");
window.moveTo(timebucket);
} else {
window = windows.get(meta);
}
}
window.add(indicator);
}
}
/**
 * Start this rule in running mode.
 * <p>
 * The rule's window is started at the given time first; the {@code isStarted} flag is raised
 * only afterwards.
 *
 * @param current the time handed to the window's {@code start}
 */
public void start(LocalDateTime current) {
window.start(current);
// Raise the flag only after the window has been started.
isStarted = true;
}
/**
* Move the buffer window to the given time.
*
* @param targetTime of moving target
*/
public void moveTo(LocalDateTime targetTime) {
window.moveTo(targetTime);
windows.values().forEach(window -> window.moveTo(targetTime));
}
/**
* Check the conditions and decide whether to trigger an alarm.
*/
public AlarmMessage check() {
boolean isMatched = window.isMatch();
public List<AlarmMessage> check() {
List<AlarmMessage> alarmMessageList = new ArrayList<>(30);
windows.values().forEach(window -> {
AlarmMessage alarmMessage = window.checkAlarm();
if (alarmMessage != AlarmMessage.NONE) {
alarmMessageList.add(alarmMessage);
}
});
/**
* When
......@@ -133,10 +153,11 @@ public class RunningRule {
* 2. Counter reaches the count threshold;
* 3. Isn't in silence stage, judged by SilenceCountdown(!=0).
*/
if (isMatched) {
if (alarmMessageList.size() > 0) {
counter++;
if (counter >= countThreshold && silenceCountdown < 1) {
return triggerAlarm();
silenceCountdown = silencePeriod;
return alarmMessageList;
} else {
silenceCountdown--;
}
......@@ -146,16 +167,7 @@ public class RunningRule {
counter--;
}
}
return AlarmMessage.NONE;
}
/**
* Trigger alarm callbacks.
*/
private AlarmMessage triggerAlarm() {
silenceCountdown = silencePeriod;
AlarmMessage message = new AlarmMessage();
return message;
return new ArrayList<>(0);
}
/**
......@@ -176,27 +188,28 @@ public class RunningRule {
init();
}
/**
 * Record the given time as this object's {@code endTime}.
 *
 * @param current the time stored into {@code endTime}
 */
public void start(LocalDateTime current) {
this.endTime = current;
}
public void moveTo(LocalDateTime current) {
lock.lock();
try {
int minutes = Minutes.minutesBetween(endTime, current).getMinutes();
if (minutes <= 0) {
return;
}
if (minutes > values.size()) {
// re-init
if (endTime == null) {
init();
endTime = current;
} else {
for (int i = 0; i < minutes; i++) {
values.removeFirst();
values.addLast(null);
int minutes = Minutes.minutesBetween(endTime, current).getMinutes();
if (minutes <= 0) {
return;
}
if (minutes > values.size()) {
// re-init
init();
} else {
for (int i = 0; i < minutes; i++) {
values.removeFirst();
values.addLast(null);
}
}
endTime = current;
}
endTime = current;
} finally {
lock.unlock();
}
......@@ -216,9 +229,8 @@ public class RunningRule {
lock.lock();
try {
if (minutes < 0) {
// At any moment, should NOT be here
// Add this code just because of my obsession :P
return;
moveTo(timebucket);
minutes = 0;
}
if (minutes >= values.size()) {
......@@ -233,7 +245,16 @@ public class RunningRule {
}
}
public boolean isMatch() {
/**
 * Evaluate this window once and translate the outcome into an alarm message.
 *
 * @return a newly created {@link AlarmMessage} when {@code isMatch()} reports a match,
 * otherwise the {@code AlarmMessage.NONE} constant.
 */
public AlarmMessage checkAlarm() {
    // Nothing matched: hand back the "no alarm" constant.
    if (!isMatch()) {
        return AlarmMessage.NONE;
    }
    return new AlarmMessage();
}
private boolean isMatch() {
int matchCount = 0;
for (Indicator indicator : values) {
if (indicator == null) {
......
......@@ -65,7 +65,7 @@ public class AlarmCoreTest {
if (isAdd[0]) {
checkTime.add(LocalDateTime.now());
}
return null;
return new ArrayList<>(0);
}
}).when(mockRule).check();
......@@ -81,6 +81,9 @@ public class AlarmCoreTest {
Assert.assertTrue(checkTimePoints(checkTime));
break;
}
if(i == 9){
Assert.assertTrue(false);
}
}
}
......
......@@ -33,6 +33,9 @@ public class AlarmRuleInitTest {
Assert.assertEquals(2, ruleList.size());
Assert.assertEquals("85", ruleList.get(1).getThreshold());
Assert.assertEquals("endpoint_percent_rule", ruleList.get(0).getAlarmRuleName());
Assert.assertEquals(0, ruleList.get(0).getIncludeNames().size());
Assert.assertEquals("service_b", ruleList.get(1).getIncludeNames().get(1));
List<String> rulesWebhooks = rules.getWebhooks();
Assert.assertEquals(2, rulesWebhooks.size());
......
......@@ -20,6 +20,8 @@ package org.apache.skywalking.oap.server.core.alarm.provider;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.skywalking.oap.server.core.alarm.AlarmCallback;
import org.apache.skywalking.oap.server.core.alarm.AlarmMessage;
import org.apache.skywalking.oap.server.core.alarm.MetaInAlarm;
......@@ -55,10 +57,13 @@ public class RunningRuleTest {
alarmRule.setPeriod(15);
RunningRule runningRule = new RunningRule(alarmRule);
LocalDateTime startTime = TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301440");
runningRule.start(startTime);
LocalDateTime startTime = TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301434");
long timeInPeriod1 = 201808301434L;
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod1, 70));
Map<MetaInAlarm, RunningRule.Window> windows = Whitebox.getInternalState(runningRule, "windows");
RunningRule.Window window = Whitebox.getInternalState(runningRule, "window");
RunningRule.Window window = windows.get(getMetaInAlarm(123));
LocalDateTime endTime = Whitebox.getInternalState(window, "endTime");
int period = Whitebox.getInternalState(window, "period");
LinkedList<Indicator> indicatorBuffer = Whitebox.getInternalState(window, "values");
......@@ -81,23 +86,22 @@ public class RunningRuleTest {
RunningRule runningRule = new RunningRule(alarmRule);
LocalDateTime startTime = TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301440");
runningRule.start(startTime);
long timeInPeriod1 = 201808301434L;
long timeInPeriod2 = 201808301436L;
long timeInPeriod3 = 201808301438L;
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod3, 74));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod3, 74));
// check at 201808301440
Assert.assertEquals(AlarmMessage.NONE, runningRule.check());
Assert.assertEquals(0, runningRule.check().size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
Assert.assertEquals(AlarmMessage.NONE, runningRule.check());
Assert.assertEquals(0, runningRule.check().size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301442
Assert.assertNotEquals(AlarmMessage.NONE, runningRule.check());
Assert.assertEquals(1, runningRule.check().size());
}
......@@ -123,27 +127,26 @@ public class RunningRuleTest {
};
LinkedList<AlarmCallback> callbackList = new LinkedList<>();
callbackList.add(assertCallback);
runningRule.start(startTime);
long timeInPeriod1 = 201808301434L;
long timeInPeriod2 = 201808301436L;
long timeInPeriod3 = 201808301438L;
long timeInPeriod4 = 201808301432L;
long timeInPeriod5 = 201808301440L;
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod3, 74));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod4, 90));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod5, 95));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod3, 74));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod4, 90));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod5, 95));
// check at 201808301440
Assert.assertEquals(AlarmMessage.NONE, runningRule.check());
Assert.assertEquals(0, runningRule.check().size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301441
Assert.assertEquals(AlarmMessage.NONE, runningRule.check());
Assert.assertEquals(0, runningRule.check().size());
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301443"));
// check at 201808301442
Assert.assertEquals(AlarmMessage.NONE, runningRule.check());
Assert.assertEquals(0, runningRule.check().size());
}
@Test
......@@ -158,42 +161,38 @@ public class RunningRuleTest {
alarmRule.setSilencePeriod(2);
RunningRule runningRule = new RunningRule(alarmRule);
LocalDateTime startTime = TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301440");
runningRule.start(startTime);
long timeInPeriod1 = 201808301434L;
long timeInPeriod2 = 201808301436L;
long timeInPeriod3 = 201808301438L;
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(), getIndicator(timeInPeriod3, 74));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod1, 70));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod2, 71));
runningRule.in(getMetaInAlarm(123), getIndicator(timeInPeriod3, 74));
// check at 201808301440
Assert.assertEquals(AlarmMessage.NONE, runningRule.check()); //check matches, no alarm
Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301441"));
// check at 201808301441
Assert.assertEquals(AlarmMessage.NONE, runningRule.check()); //check matches, no alarm
Assert.assertEquals(0, runningRule.check().size()); //check matches, no alarm
runningRule.moveTo(TIME_BUCKET_FORMATTER.parseLocalDateTime("201808301442"));
// check at 201808301442
Assert.assertNotEquals(AlarmMessage.NONE, runningRule.check()); //alarm
Assert.assertEquals(AlarmMessage.NONE, runningRule.check()); //silence, no alarm
Assert.assertEquals(AlarmMessage.NONE, runningRule.check()); //silence, no alarm
runningRule.check(); //alarm
Assert.assertEquals(AlarmMessage.NONE, runningRule.check()); //silence, no alarm
Assert.assertEquals(AlarmMessage.NONE, runningRule.check()); //silence, no alarm
Assert.assertNotEquals(AlarmMessage.NONE, runningRule.check()); //alarm
Assert.assertNotEquals(0, runningRule.check().size()); //alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertNotEquals(0, runningRule.check().size()); //alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertEquals(0, runningRule.check().size()); //silence, no alarm
Assert.assertNotEquals(0, runningRule.check().size()); //alarm
}
private MetaInAlarm getMetaInAlarm() {
private MetaInAlarm getMetaInAlarm(int id) {
return new MetaInAlarm() {
@Override public Scope getScope() {
return Scope.Service;
}
@Override public String getName() {
return "Service_123";
return "Service_" + id;
}
@Override public String getIndicatorName() {
......@@ -201,12 +200,21 @@ public class RunningRuleTest {
}
@Override public int getId0() {
return 123;
return id;
}
@Override public int getId1() {
return 0;
}
/**
 * Compare this stub with another {@link MetaInAlarm} by primary id only: the captured
 * {@code id} is checked against the other object's {@code getId0()}.
 *
 * @param o the object to compare with, may be {@code null}
 * @return {@code true} when {@code o} is a {@link MetaInAlarm} whose {@code getId0()} equals
 * this stub's id; {@code false} for {@code null} or any other type.
 */
@Override public boolean equals(Object o) {
    if (this == o) {
        return true;
    }
    // equals must answer false, not throw, for null or foreign types;
    // instanceof covers both cases.
    if (!(o instanceof MetaInAlarm)) {
        return false;
    }
    return id == ((MetaInAlarm)o).getId0();
}
/**
 * Hash derived solely from the captured {@code id}.
 *
 * @return {@code Objects.hash(id)}
 */
@Override public int hashCode() {
return Objects.hash(id);
}
};
}
......
......@@ -29,6 +29,10 @@ rules:
silence-period: 10
service_percent_rule:
indicator-name: service_percent
# [Optional] When omitted, all services in this indicator are matched by default.
include-names:
- service_a
- service_b
threshold: 85
op: <
period: 10
......
......@@ -18,6 +18,7 @@
package org.apache.skywalking.oap.server.core.alarm;
import java.util.Objects;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.source.Scope;
......@@ -35,4 +36,18 @@ public class AlarmMeta {
this.scope = scope;
this.ids = new ScopeIDs(ids);
}
/**
 * Two metas are equal when they have the same runtime class and their {@code ids} are equal
 * according to {@link Objects#equals(Object, Object)}. No other field is consulted.
 *
 * @param o the object to compare with, may be {@code null}
 * @return {@code true} if {@code o} is the same instance, or an instance of the same class
 * with equal {@code ids}.
 */
@Override public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    // Reject null and instances of any other class before casting.
    if (o == null || o.getClass() != getClass()) {
        return false;
    }
    final AlarmMeta other = (AlarmMeta)o;
    return Objects.equals(this.ids, other.ids);
}
/**
 * Hash derived solely from {@code ids}.
 *
 * @return {@code Objects.hash(ids)}
 */
@Override public int hashCode() {
return Objects.hash(ids);
}
}
......@@ -18,31 +18,41 @@
package org.apache.skywalking.oap.server.core.alarm;
import java.util.Objects;
import org.apache.skywalking.oap.server.core.source.Scope;
public interface MetaInAlarm {
Scope getScope();
public abstract class MetaInAlarm {
public abstract Scope getScope();
String getName();
public abstract String getName();
String getIndicatorName();
public abstract String getIndicatorName();
/**
* In most scopes, there is only id0, as primary id. Such as Service, Endpoint.
* But in relation, the ID includes two, actually.
* Such as ServiceRelation,
* id0 represents the source service id
* In most scopes, there is only id0, as primary id. Such as Service, Endpoint. But in relation, the ID includes
* two, actually. Such as ServiceRelation, id0 represents the source service id
*
* @return the primary id.
*/
int getId0();
public abstract int getId0();
/**
* Only exist in multiple IDs case,
* Such as ServiceRelation,
* id1 represents the dest service id
* Only exist in multiple IDs case, Such as ServiceRelation, id1 represents the dest service id
*
* @return
*/
int getId1();
public abstract int getId1();
/**
 * Identity of a meta is the pair ({@code getId0()}, {@code getId1()}) within one concrete
 * class: objects of different runtime classes are never equal, even with identical ids.
 *
 * @param o the object to compare with, may be {@code null}
 * @return {@code true} if {@code o} is the same instance, or has the same runtime class and
 * the same id0 and id1.
 */
@Override public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    if (o == null || o.getClass() != getClass()) {
        return false;
    }
    MetaInAlarm other = (MetaInAlarm)o;
    // The secondary id is only compared once the primary ids agree.
    if (getId0() != other.getId0()) {
        return false;
    }
    return getId1() == other.getId1();
}
/**
 * Hash built from the two ids returned by {@code getId0()} and {@code getId1()}.
 *
 * @return {@code Objects.hash(getId0(), getId1())}
 */
@Override public int hashCode() {
return Objects.hash(getId0(), getId1());
}
}
......@@ -18,6 +18,8 @@
package org.apache.skywalking.oap.server.core.alarm;
import java.util.Arrays;
/**
* Scope IDs represent IDs of this scope.
* Such as:
......@@ -34,4 +36,17 @@ public class ScopeIDs {
/**
 * Read one id by position.
 *
 * @param idx index into {@code ids}; no range check is done in this method
 * @return the value stored at {@code ids[idx]}
 */
public int getID(int idx) {
return ids[idx];
}
/**
 * Two instances are equal when they have the same runtime class and their {@code ids}
 * are equal according to {@code Arrays.equals}.
 *
 * @param o the object to compare with, may be {@code null}
 * @return {@code true} if {@code o} is the same instance, or an instance of the same class
 * whose {@code ids} pass {@code Arrays.equals}.
 */
@Override public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    boolean sameClass = o != null && o.getClass() == getClass();
    if (!sameClass) {
        return false;
    }
    ScopeIDs other = (ScopeIDs)o;
    return Arrays.equals(this.ids, other.ids);
}
/**
 * Hash computed from the contents of {@code ids}.
 *
 * @return {@code Arrays.hashCode(ids)}
 */
@Override public int hashCode() {
return Arrays.hashCode(ids);
}
}
......@@ -25,7 +25,7 @@ import org.apache.skywalking.oap.server.core.source.Scope;
@Getter(AccessLevel.PUBLIC)
@Setter(AccessLevel.PUBLIC)
public class ServiceMetaInAlarm implements MetaInAlarm {
public class ServiceMetaInAlarm extends MetaInAlarm {
private String indicatorName;
private int id;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册