diff --git a/CHANGES.md b/CHANGES.md index e22c38194667772f736d7c6dcc3c6e0107009b0a..ea93875372a943348a01e8329dc795749c3d2450 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -54,6 +54,7 @@ Release Notes. * Include events of the entity(s) in the alarm. * Support `native-json` format log in kafka-fetcher-plugin. * Fix counter misuse in the alarm core. Alarm can't be triggered in time. +* Events can be configured as alarm source. #### UI * Add logo for kong plugin. diff --git a/docs/en/concepts-and-designs/event.md b/docs/en/concepts-and-designs/event.md index e73d90b1aec0c844a46132e0d424c9cbad55215e..940a64109db1666932367794fd5f1a1a59a474a8 100644 --- a/docs/en/concepts-and-designs/event.md +++ b/docs/en/concepts-and-designs/event.md @@ -55,10 +55,56 @@ The end time of the event. This field may be empty if the event has not ended ye **NOTE:** When reporting an event, you typically call the report function twice, the first time for starting of the event and the second time for ending of the event, both with the same UUID. There are also cases where you would already have both the start time and end time. For example, when exporting events from a third-party system, the start time and end time are already known so you may simply call the report function once. +## How to Configure Alarms for Events + +Events are derived from metrics, and can be the source to trigger alarms. For example, if a specific event occurs a +certain number of times in a period, alarms can be triggered and sent. + +Every event has a default `value = 1`; when `n` events with the same name are reported, they are aggregated +into `value = n` as follows. 
+ +``` +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +Event{name=Unhealthy, source={service=A,instance=a}, ...} +``` + +will be aggregated into + +``` +Event{name=Unhealthy, source={service=A,instance=a}, ...} <value = 6> +``` + +so you can configure the following alarm rule to trigger an alarm when the `Unhealthy` event occurs more than 5 times within 10 +minutes. + +```yaml +rules: + unhealthy_event_rule: + metrics-name: Unhealthy + # Healthiness check is usually a scheduled task, + # they may be unhealthy for the first few times, + # and can be unhealthy occasionally due to network jitter, + # please adjust the threshold as per your actual situation. + threshold: 5 + op: ">" + period: 10 + count: 1 + message: Service instance has been unhealthy for 10 minutes +``` + +For more alarm configuration details, please refer to the [alarm doc](../setup/backend/backend-alarm.md). + +**Note** that the `Unhealthy` event above is only for demonstration; it is not detected by default in SkyWalking. +However, you can use the methods in [How to Report Events](#how-to-report-events) to report this kind of event. + ## Known Events | Name | Type | When | | :----: | :----: | :-----| | Start | Normal | When your Java Application starts with SkyWalking Agent installed, the `Start` Event will be created. | | Shutdown | Normal | When your Java Application stops with SkyWalking Agent installed, the `Shutdown` Event will be created. | -| Alarm | Error | When the Alarm is triggered, the corresponding `Alarm` Event will is created. | \ No newline at end of file +| Alarm | Error | When the Alarm is triggered, the corresponding `Alarm` Event will be created. 
| diff --git a/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml b/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml index 0efbe267ce2c3453ca215e97eb15bfc7a622db47..a255ef414c666a4695d6f2aaeb91f9db10c3a553 100755 --- a/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml +++ b/oap-server/server-bootstrap/src/main/resources/alarm-settings.yml @@ -40,6 +40,19 @@ rules: count: 1 tags: level: WARNING +# unhealthy_event_rule: +# metrics-name: Unhealthy + # Healthiness check is usually a scheduled task, + # they may be unhealthy for the first few times, + # and can be unhealthy occasionally due to network jitter, + # please adjust the threshold as per your actual situation. +# threshold: 5 +# op: ">" +# period: 10 +# count: 1 +# message: Service instance has been unhealthy for 10 minutes +# tags: +# level: ERROR webhooks: # - http://127.0.0.1/notify/ diff --git a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java index 70655895112d8142db499bde135a4c8131737be2..37269be5ba1f81121da872a26147028793f711f7 100644 --- a/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java +++ b/oap-server/server-core/src/main/java/org/apache/skywalking/oap/server/core/event/Event.java @@ -24,15 +24,21 @@ import lombok.EqualsAndHashCode; import lombok.Getter; import lombok.Setter; import org.apache.skywalking.apm.util.StringUtil; +import org.apache.skywalking.oap.server.core.analysis.IDManager; import org.apache.skywalking.oap.server.core.analysis.MetricsExtension; import org.apache.skywalking.oap.server.core.analysis.Stream; import org.apache.skywalking.oap.server.core.analysis.TimeBucket; +import org.apache.skywalking.oap.server.core.analysis.metrics.LongValueHolder; import org.apache.skywalking.oap.server.core.analysis.metrics.Metrics; +import 
org.apache.skywalking.oap.server.core.analysis.metrics.MetricsMetaInfo; +import org.apache.skywalking.oap.server.core.analysis.metrics.WithMetadata; import org.apache.skywalking.oap.server.core.analysis.worker.MetricsStreamProcessor; import org.apache.skywalking.oap.server.core.remote.grpc.proto.RemoteData; +import org.apache.skywalking.oap.server.core.source.DefaultScopeDefine; import org.apache.skywalking.oap.server.core.source.ScopeDeclaration; import org.apache.skywalking.oap.server.core.storage.StorageHashMapBuilder; import org.apache.skywalking.oap.server.core.storage.annotation.Column; +import org.elasticsearch.common.Strings; import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.EVENT; @@ -45,7 +51,7 @@ import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.EV of = "uuid" ) @MetricsExtension(supportDownSampling = false, supportUpdate = true) -public class Event extends Metrics { +public class Event extends Metrics implements WithMetadata, LongValueHolder { public static final String INDEX_NAME = "events"; @@ -104,10 +110,14 @@ public class Event extends Metrics { @Column(columnName = END_TIME) private long endTime; + private transient long value = 1; + @Override public boolean combine(final Metrics metrics) { final Event event = (Event) metrics; + value++; + // Set time bucket only when it's never set. 
if (getTimeBucket() <= 0) { if (event.getStartTime() > 0) { @@ -193,6 +203,26 @@ public class Event extends Metrics { return hashCode(); } + /** + * Builds the alarm metadata of this event: the event name is used as the metrics name, and the scope and + * entity id are resolved from the event source. A non-empty service instance takes precedence, then a + * non-empty endpoint; otherwise the event is attributed to the service itself. + */ + @Override + public MetricsMetaInfo getMeta() { + int scope = DefaultScopeDefine.SERVICE; + final String serviceId = IDManager.ServiceID.buildId(getService(), true); + String id = serviceId; + if (!StringUtil.isEmpty(getServiceInstance())) { + scope = DefaultScopeDefine.SERVICE_INSTANCE; + id = IDManager.ServiceInstanceID.buildId(serviceId, getServiceInstance()); + } else if (!StringUtil.isEmpty(getEndpoint())) { + scope = DefaultScopeDefine.ENDPOINT; + id = IDManager.EndpointID.buildId(serviceId, getEndpoint()); + } + return new MetricsMetaInfo(getName(), scope, id); + } + public static class Builder implements StorageHashMapBuilder { @Override public Map entity2Storage(Event storageData) {