未验证 提交 fc0d0e1e 编写于 作者: wu-sheng's avatar wu-sheng 提交者: GitHub

Message Queue performance and consuming latency monitoring (#7918)

* Support `!= null` in OAL engine.
* Add `Message Queue Consuming Count` metric for MQ consuming service and endpoint.
* Add `Message Queue Avg Consuming Latency` metric for MQ consuming service and endpoint.
* Document with a new menu in the `tracing` catalog is added
上级 f4c99db4
......@@ -27,6 +27,9 @@ Release Notes.
* Refactor the OAL compiler context to improve readability.
* Fix wrong generated codes of `hashCode` and `remoteHashCode` methods for numeric fields.
* Replace e2e cases to e2e-v2: Nodejs.
* Support `!= null` in OAL engine.
* Add `Message Queue Consuming Count` metric for MQ consuming service and endpoint.
* Add `Message Queue Avg Consuming Latency` metric for MQ consuming service and endpoint.
#### UI
......
......@@ -22,10 +22,11 @@ The core features are following.
- Root cause analysis. Profile the code on the runtime
- Service topology map analysis
- Service, service instance and endpoint dependency analysis
- Slow services and endpoints detected
- Slow services and endpoints detecting
- Performance optimization
- Distributed tracing and context propagation
- Database access metrics. Detect slow database access statements(including SQL statements)
- Message Queue performance and consuming latency monitoring
- Alarm
- Browser performance monitoring
- Infrastructure(VM, network, disk etc.) monitoring
......
# Message Queue performance and consuming latency monitoring
Message Queue server plays an important role in today's distributed system, in order to reduce the length and latency of
blocking RPC, and eventually improve user experience. But in this async way, the measure for queue consuming traffic and
latency becomes significant.
Since 8.9.0, SkyWalking leverages native tracing agent and [**Extension Header
Item** of SkyWalking Cross Process Propagation Headers Protocol v3](../../protocols/Skywalking-Cross-Process-Propagation-Headers-Protocol-v3.md#extension-header-item)
, to provide performance monitoring for Message Queue system.
In default, we provide `Message Queue Consuming Count` and `Message Queue Avg Consuming Latency` metrics for service and
endpoint levels.
More metrics could be added through `core.oal`.
......@@ -107,6 +107,8 @@ catalog:
path: "/en/setup/backend/trace-sampling"
- name: "Detect Slow Database Statement"
path: "/en/setup/backend/slow-db-statement"
- name: "Message Queue Performance"
path: "/en/setup/backend/mq"
- name: "Uninstrumented Gateways"
path: "/en/setup/backend/uninstrumented-gateways"
- name: "Metrics"
......
......@@ -264,6 +264,9 @@ public class MultiScopesAnalysisListener implements EntryAnalysisListener, ExitA
case Database:
sourceBuilder.setType(RequestType.DATABASE);
break;
case MQ:
sourceBuilder.setType(RequestType.MQ);
break;
default:
sourceBuilder.setType(RequestType.RPC);
break;
......
......@@ -78,6 +78,7 @@ CONTAIN: 'contain';
NOT_CONTAIN: 'not contain';
// Literals
NULL_LITERAL: 'null';
BOOL_LITERAL: 'true'
| 'false'
......
......@@ -121,7 +121,7 @@ numberMatch
;
stringMatch
: conditionAttributeStmt DUALEQUALS (stringConditionValue | enumConditionValue)
: conditionAttributeStmt DUALEQUALS (stringConditionValue | enumConditionValue | nullConditionValue)
;
greaterMatch
......@@ -145,7 +145,7 @@ booleanNotEqualMatch
;
notEqualMatch
: conditionAttributeStmt NOT_EQUAL (numberConditionValue | stringConditionValue | enumConditionValue)
: conditionAttributeStmt NOT_EQUAL (numberConditionValue | stringConditionValue | enumConditionValue | nullConditionValue)
;
likeMatch
......@@ -188,6 +188,10 @@ numberConditionValue
: NUMBER_LITERAL
;
nullConditionValue
: NULL_LITERAL
;
sourceAttrCast
: castStmt
;
......
......@@ -202,7 +202,7 @@ public class OALListener extends OALParserBaseListener {
@Override
public void enterEnumConditionValue(OALParser.EnumConditionValueContext ctx) {
enterConditionValue(ctx.getText());
enterEnumConditionValue(ctx.getText());
}
@Override
......@@ -211,19 +211,24 @@ public class OALListener extends OALParserBaseListener {
enterConditionValue(ctx.getText());
}
@Override
public void enterNullConditionValue(OALParser.NullConditionValueContext ctx) {
enterConditionValue(ctx.getText());
}
@Override
public void enterExpressionAttrCast(final OALParser.ExpressionAttrCastContext ctx) {
conditionExpression.setCastType(ctx.getText());
}
private void enterConditionValue(String value) {
if (value.split("\\.").length == 2 && !value.startsWith("\"")) {
// Value is an enum.
value = sourcePackage + value;
}
conditionExpression.addValue(value);
}
private void enterEnumConditionValue(String value) {
conditionExpression.addValue(sourcePackage + value);
}
/////////////
// Expression end.
////////////
......
......@@ -22,7 +22,6 @@ import java.util.List;
import java.util.Map;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.Const;
import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.ALL;
......@@ -73,6 +72,6 @@ public class All extends Source {
private Map<String, String> originalTags;
public String getTag(String key) {
return originalTags.getOrDefault(key, Const.EMPTY_STRING);
return originalTags.get(key);
}
}
......@@ -24,7 +24,6 @@ import lombok.Getter;
import lombok.Setter;
import lombok.extern.slf4j.Slf4j;
import org.apache.skywalking.apm.util.StringUtil;
import org.apache.skywalking.oap.server.core.Const;
import org.apache.skywalking.oap.server.core.analysis.IDManager;
import org.apache.skywalking.oap.server.core.analysis.NodeType;
......@@ -100,6 +99,6 @@ public class Endpoint extends Source {
}
public String getTag(String key) {
return originalTags.getOrDefault(key, Const.EMPTY_STRING);
return originalTags.get(key);
}
}
......@@ -30,5 +30,6 @@ public enum RequestType {
* Logic request only.
*/
LOGIC,
TCP
TCP,
MQ
}
......@@ -22,7 +22,6 @@ import java.util.List;
import java.util.Map;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.Const;
import org.apache.skywalking.oap.server.core.analysis.IDManager;
import org.apache.skywalking.oap.server.core.analysis.NodeType;
......@@ -92,6 +91,6 @@ public class Service extends Source {
private TCPInfo tcpInfo = new TCPInfo();
public String getTag(String key) {
return originalTags.getOrDefault(key, Const.EMPTY_STRING);
return originalTags.get(key);
}
}
......@@ -18,15 +18,13 @@
package org.apache.skywalking.oap.server.core.source;
import java.util.List;
import java.util.Map;
import lombok.Getter;
import lombok.Setter;
import org.apache.skywalking.oap.server.core.Const;
import org.apache.skywalking.oap.server.core.analysis.IDManager;
import org.apache.skywalking.oap.server.core.analysis.NodeType;
import java.util.List;
import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.SERVICE_INSTANCE;
import static org.apache.skywalking.oap.server.core.source.DefaultScopeDefine.SERVICE_INSTANCE_CATALOG_NAME;
......@@ -102,6 +100,6 @@ public class ServiceInstance extends Source {
}
public String getTag(String key) {
return originalTags.getOrDefault(key, Const.EMPTY_STRING);
return originalTags.get(key);
}
}
......@@ -19,6 +19,7 @@
package org.apache.skywalking.oap.server.receiver.trace.mock;
import io.grpc.stub.StreamObserver;
import org.apache.skywalking.apm.network.common.v3.KeyStringValuePair;
import org.apache.skywalking.apm.network.language.agent.v3.RefType;
import org.apache.skywalking.apm.network.language.agent.v3.SegmentObject;
import org.apache.skywalking.apm.network.language.agent.v3.SegmentReference;
......@@ -61,6 +62,7 @@ class ServiceCMock {
span.setIsError(false);
span.addRefs(createReference(traceId, parentSegmentId));
span.setOperationName(ServiceBMock.ROCKET_MQ_ENDPOINT);
span.addTags(KeyStringValuePair.newBuilder().setKey("transmission.latency").setValue("100").build());
return span;
}
......
......@@ -29,6 +29,8 @@ service_sla = from(Service.*).percent(status == true);
service_cpm = from(Service.*).cpm();
service_percentile = from(Service.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
service_apdex = from(Service.latency).apdex(name, status);
service_mq_consume_count = from(Service.*).filter(type == RequestType.MQ).count();
service_mq_consume_latency = from((str->long)Service.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Service relation scope metrics for topology
service_relation_client_cpm = from(ServiceRelation.*).filter(detectPoint == DetectPoint.CLIENT).cpm();
......@@ -60,6 +62,8 @@ endpoint_cpm = from(Endpoint.*).cpm();
endpoint_avg = from(Endpoint.latency).longAvg();
endpoint_sla = from(Endpoint.*).percent(status == true);
endpoint_percentile = from(Endpoint.latency).percentile(10); // Multiple values including p50, p75, p90, p95, p99
endpoint_mq_consume_count = from(Endpoint.*).filter(type == RequestType.MQ).count();
endpoint_mq_consume_latency = from((str->long)Endpoint.tag["transmission.latency"]).filter(type == RequestType.MQ).filter(tag["transmission.latency"] != null).longAvg();
// Endpoint relation scope metrics
endpoint_relation_cpm = from(EndpointRelation.*).filter(detectPoint == DetectPoint.SERVER).cpm();
......
......@@ -121,9 +121,9 @@ templates:
"name": "Service",
"children": [
{
"width": 3,
"width": 4,
"title": "Service Apdex",
"height": "200",
"height": "100",
"entityType": "Service",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
......@@ -134,21 +134,9 @@ templates:
"aggregationNum": "10000"
},
{
"width": 3,
"title": "Service Avg Response Time",
"height": "200",
"entityType": "Service",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
"metricName": "service_resp_time",
"queryMetricType": "readMetricsValues",
"chartType": "ChartLine",
"unit": "ms"
},
{
"width": 3,
"width": 4,
"title": "Successful Rate",
"height": "200",
"height": "100",
"entityType": "Service",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
......@@ -160,9 +148,9 @@ templates:
"aggregationNum": "100"
},
{
"width": 3,
"width": 4,
"title": "Service Load",
"height": "200",
"height": "100",
"entityType": "Service",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
......@@ -172,6 +160,18 @@ templates:
"unit": "CPM / PPM",
"tips": "For HTTP 1/2, gRPC, RPC services, this means Calls Per Minute (CPM), for TCP services, this means Packets Per Minute (PPM)"
},
{
"width": 3,
"title": "Service Avg Response Time",
"height": "200",
"entityType": "Service",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
"metricName": "service_resp_time",
"queryMetricType": "readMetricsValues",
"chartType": "ChartLine",
"unit": "ms"
},
{
"width": 3,
"title": "Service Apdex",
......@@ -240,7 +240,33 @@ templates:
"tips": "This metrics is only avaible for TCP services"
},
{
"width": "3",
"width": 3,
"title": "Message Queue Consuming Count",
"height": "200",
"entityType": "Service",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
"metricName": "service_mq_consume_count",
"queryMetricType": "readMetricsValues",
"chartType": "ChartLine",
"unit": "",
"tips": "The number of consumed messages."
},
{
"width": 3,
"title": "Message Queue Avg Consuming Latency",
"height": "200",
"entityType": "Service",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
"metricName": "service_mq_consume_latency",
"queryMetricType": "readMetricsValues",
"chartType": "ChartLine",
"unit": "ms",
"tips": "The avg latency of message consuming. Latency = timestamp(received) - timestamp(producing)"
},
{
"width": "4",
"title": "Service Instances Load",
"height": "280",
"entityType": "ServiceInstance",
......@@ -254,7 +280,7 @@ templates:
"tips": "For HTTP 1/2, gRPC, RPC services, this means Calls Per Minute (CPM), for TCP services, this means Packets Per Minute (PPM)"
},
{
"width": "3",
"width": "4",
"title": "Slow Service Instance",
"height": "280",
"entityType": "ServiceInstance",
......@@ -267,7 +293,7 @@ templates:
"unit": "ms"
},
{
"width": "3",
"width": "4",
"title": "Service Instance Successful Rate",
"height": "280",
"entityType": "ServiceInstance",
......@@ -519,9 +545,9 @@ templates:
"sortOrder": "ASC"
},
{
"width": 3,
"width": 4,
"title": "Endpoint Load",
"height": 350,
"height": 200,
"entityType": "Endpoint",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
......@@ -530,9 +556,9 @@ templates:
"chartType": "ChartLine"
},
{
"width": 3,
"width": 4,
"title": "Endpoint Avg Response Time",
"height": 350,
"height": 200,
"entityType": "Endpoint",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
......@@ -542,9 +568,9 @@ templates:
"unit": "ms"
},
{
"width": 3,
"width": 4,
"title": "Endpoint Response Time Percentile",
"height": 350,
"height": 200,
"entityType": "Endpoint",
"independentSelector": false,
"metricType": "LABELED_VALUE",
......@@ -556,9 +582,9 @@ templates:
"unit": "ms"
},
{
"width": 3,
"width": 4,
"title": "Endpoint Successful Rate",
"height": 350,
"height": 200,
"entityType": "Endpoint",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
......@@ -568,6 +594,32 @@ templates:
"unit": "%",
"aggregation": "/",
"aggregationNum": "100"
},
{
"width": 4,
"title": "Message Queue Consuming Count",
"height": "200",
"entityType": "Endpoint",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
"metricName": "endpoint_mq_consume_count",
"queryMetricType": "readMetricsValues",
"chartType": "ChartLine",
"unit": "",
"tips": "The number of consumed messages."
},
{
"width": 4,
"title": "Message Queue Avg Consuming Latency",
"height": "200",
"entityType": "Endpoint",
"independentSelector": false,
"metricType": "REGULAR_VALUE",
"metricName": "endpoint_mq_consume_latency",
"queryMetricType": "readMetricsValues",
"chartType": "ChartLine",
"unit": "ms",
"tips": "The avg latency of message consuming. Latency = timestamp(received) - timestamp(producing)"
}
]
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册