From a68cec85afcfaa27b44c4f7de462bc225cf49269 Mon Sep 17 00:00:00 2001 From: yah01 Date: Thu, 10 Nov 2022 15:01:04 +0800 Subject: [PATCH] Refine QueryCoordV2 metrics (#20461) Signed-off-by: yah01 Signed-off-by: yah01 --- internal/metrics/querycoord_metrics.go | 27 +++---------------- internal/querycoordv2/job/job.go | 5 ++++ internal/querycoordv2/server.go | 3 +++ internal/querycoordv2/session/node_manager.go | 8 +++++- internal/querycoordv2/task/scheduler.go | 3 +++ 5 files changed, 22 insertions(+), 24 deletions(-) diff --git a/internal/metrics/querycoord_metrics.go b/internal/metrics/querycoord_metrics.go index 8400a628c..ca3241ac6 100644 --- a/internal/metrics/querycoord_metrics.go +++ b/internal/metrics/querycoord_metrics.go @@ -77,29 +77,12 @@ var ( Buckets: []float64{0, 5, 10, 20, 40, 100, 200, 400, 1000, 10000}, }, []string{}) - QueryCoordNumChildTasks = prometheus.NewGaugeVec( + QueryCoordTaskNum = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: milvusNamespace, Subsystem: typeutil.QueryCoordRole, - Name: "child_task_num", - Help: "number of child tasks in QueryCoord's queue", - }, []string{}) - - QueryCoordNumParentTasks = prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: milvusNamespace, - Subsystem: typeutil.QueryCoordRole, - Name: "parent_task_num", - Help: "number of parent tasks in QueryCoord's queue", - }, []string{}) - - QueryCoordChildTaskLatency = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Namespace: milvusNamespace, - Subsystem: typeutil.QueryCoordRole, - Name: "child_task_latency", - Help: "latency of child tasks", - Buckets: buckets, + Name: "task_num", + Help: "the number of tasks in QueryCoord's scheduler", }, []string{}) QueryCoordNumQueryNodes = prometheus.NewGaugeVec( @@ -119,8 +102,6 @@ func RegisterQueryCoord(registry *prometheus.Registry) { registry.MustRegister(QueryCoordReleaseCount) registry.MustRegister(QueryCoordLoadLatency) registry.MustRegister(QueryCoordReleaseLatency) - registry.MustRegister(QueryCoordNumChildTasks) - registry.MustRegister(QueryCoordNumParentTasks) - registry.MustRegister(QueryCoordChildTaskLatency) + registry.MustRegister(QueryCoordTaskNum) registry.MustRegister(QueryCoordNumQueryNodes) } diff --git a/internal/querycoordv2/job/job.go b/internal/querycoordv2/job/job.go index df812ed50..102222b19 100644 --- a/internal/querycoordv2/job/job.go +++ b/internal/querycoordv2/job/job.go @@ -25,6 +25,7 @@ import ( "go.uber.org/zap" "github.com/milvus-io/milvus/internal/log" + "github.com/milvus-io/milvus/internal/metrics" "github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/querycoordv2/meta" "github.com/milvus-io/milvus/internal/querycoordv2/session" @@ -227,6 +228,7 @@ func (job *LoadCollectionJob) Execute() error { return utils.WrapError(msg, err) } + metrics.QueryCoordNumCollections.WithLabelValues().Inc() return nil } @@ -285,6 +287,7 @@ func (job *ReleaseCollectionJob) Execute() error { job.targetMgr.RemoveCollection(req.GetCollectionID()) waitCollectionReleased(job.dist, req.GetCollectionID()) + metrics.QueryCoordNumCollections.WithLabelValues().Dec() return nil } @@ -422,6 +425,7 @@ func (job *LoadPartitionJob) Execute() error { return utils.WrapError(msg, err) } + metrics.QueryCoordNumCollections.WithLabelValues().Inc() return nil } @@ -510,5 +514,6 @@ func (job *ReleasePartitionJob) Execute() error { job.targetMgr.RemovePartition(req.GetCollectionID(), toRelease...) waitCollectionReleased(job.dist, req.GetCollectionID(), toRelease...) } + metrics.QueryCoordNumCollections.WithLabelValues().Dec() return nil } diff --git a/internal/querycoordv2/server.go b/internal/querycoordv2/server.go index f525c4fcc..7959f7779 100644 --- a/internal/querycoordv2/server.go +++ b/internal/querycoordv2/server.go @@ -27,6 +27,7 @@ import ( "syscall" "time" + "github.com/milvus-io/milvus/internal/metrics" "github.com/milvus-io/milvus/internal/util/timerecord" "github.com/milvus-io/milvus-proto/go-api/commonpb" @@ -253,6 +254,8 @@ func (s *Server) initMeta() error { log.Error("failed to recover collections") return err } + metrics.QueryCoordNumCollections.WithLabelValues().Set(float64(len(s.meta.GetAll()))) + err = s.meta.ReplicaManager.Recover() if err != nil { log.Error("failed to recover replicas") diff --git a/internal/querycoordv2/session/node_manager.go b/internal/querycoordv2/session/node_manager.go index 2f375b1e4..b6cc32943 100644 --- a/internal/querycoordv2/session/node_manager.go +++ b/internal/querycoordv2/session/node_manager.go @@ -16,7 +16,11 @@ package session -import "sync" +import ( + "sync" + + "github.com/milvus-io/milvus/internal/metrics" +) type Manager interface { Add(node *NodeInfo) @@ -34,12 +38,14 @@ func (m *NodeManager) Add(node *NodeInfo) { m.mu.Lock() defer m.mu.Unlock() m.nodes[node.ID()] = node + metrics.QueryCoordNumQueryNodes.WithLabelValues().Set(float64(len(m.nodes))) } func (m *NodeManager) Remove(nodeID int64) { m.mu.Lock() defer m.mu.Unlock() delete(m.nodes, nodeID) + metrics.QueryCoordNumQueryNodes.WithLabelValues().Set(float64(len(m.nodes))) } func (m *NodeManager) Get(nodeID int64) *NodeInfo { diff --git a/internal/querycoordv2/task/scheduler.go b/internal/querycoordv2/task/scheduler.go index df7c02dfc..8c1547835 100644 --- a/internal/querycoordv2/task/scheduler.go +++ b/internal/querycoordv2/task/scheduler.go @@ -25,6 +25,7 @@ import ( "go.uber.org/zap" "github.com/milvus-io/milvus/internal/log" + "github.com/milvus-io/milvus/internal/metrics" "github.com/milvus-io/milvus/internal/proto/datapb" "github.com/milvus-io/milvus/internal/proto/querypb" "github.com/milvus-io/milvus/internal/querycoordv2/meta" @@ -232,6 +233,7 @@ func (scheduler *taskScheduler) Add(task Task) error { log.Warn("failed to add task", zap.String("task", task.String())) return nil } + metrics.QueryCoordTaskNum.WithLabelValues().Set(float64(scheduler.tasks.Len())) log.Info("task added", zap.String("task", task.String())) return nil } @@ -582,6 +584,7 @@ func (scheduler *taskScheduler) remove(task Task) { log = log.With(zap.String("channel", task.Channel())) } + metrics.QueryCoordTaskNum.WithLabelValues().Set(float64(scheduler.tasks.Len())) log.Debug("task removed") } -- GitLab