未验证 提交 9b91519c 编写于 作者: W wei liu 提交者: GitHub

fix search cost metrics on replica selection (#25059)

Signed-off-by: Wei Liu <wei.liu@zilliz.com>
上级 b3362c29
......@@ -110,12 +110,14 @@ func (b *LookAsideBalancer) SelectNode(ctx context.Context, availableNodes []int
}
}
if targetNode != -1 {
// update executing task cost
totalNQ, _ := b.executingTaskTotalNQ.Get(targetNode)
totalNQ.Add(cost)
if targetNode == -1 {
return -1, merr.WrapErrNoAvailableNode("all available nodes are unreachable")
}
// update executing task cost
totalNQ, _ := b.executingTaskTotalNQ.Get(targetNode)
totalNQ.Add(cost)
return targetNode, nil
}
......@@ -137,14 +139,20 @@ func (b *LookAsideBalancer) UpdateCostMetrics(node int64, cost *internalpb.CostA
// calculateScore computes the query node's workload score from the node's
// cost metrics (cost) and the NQ currently executing on it (executingNQ).
// Reference for the scoring approach:
// https://www.usenix.org/conference/nsdi15/technical-sessions/presentation/suresh
//
// NOTE(review): this listing looks like a diff rendered without +/- markers,
// so superseded and replacement lines appear side by side. The notes below
// mark which is presumably which — confirm against the repository.
func (b *LookAsideBalancer) calculateScore(cost *internalpb.CostAggregation, executingNQ int64) float64 {
// Presumably the superseded guard (its closing brace is not shown): with nil
// metrics or a zero ResponseTime it returned executingNQ unchanged.
if cost == nil || cost.ResponseTime == 0 {
return float64(executingNQ)
// Presumably the replacement guard: a zero ServiceTime is now also treated as
// "no usable metrics", which avoids a zero divisor in the divisions below.
// The fallback score is (1+executingNQ) cubed.
if cost == nil || cost.ResponseTime == 0 || cost.ServiceTime == 0 {
return math.Pow(float64(1+executingNQ), 3.0)
}
// ResponseTime minus the reciprocal of ServiceTime.
executeSpeed := float64(cost.ResponseTime) - float64(1)/float64(cost.ServiceTime)
// (1 + TotalNQ + executingNQ) cubed, divided by ServiceTime. The sum is
// evaluated in int64 before it is converted to float64.
workload := math.Pow(float64(1+cost.TotalNQ+executingNQ), 3.0) / float64(cost.ServiceTime)
// A negative workload is reported as math.MaxFloat64.
// NOTE(review): presumably this catches the int64 sum above wrapping around
// to a negative value on overflow — confirm.
if workload < 0.0 {
return math.MaxFloat64
}
// Presumably the superseded single-expression return: the same formula as
// executeSpeed + workload, but without the negative-workload check.
return float64(cost.ResponseTime) - float64(1)/float64(cost.ServiceTime) + math.Pow(float64(1+cost.TotalNQ+executingNQ), 3.0)/float64(cost.ServiceTime)
return executeSpeed + workload
}
func (b *LookAsideBalancer) checkQueryNodeHealthLoop(ctx context.Context) {
log := log.Ctx(context.TODO()).WithRateGroup("proxy.LookAsideBalancer", 60, 1)
log := log.Ctx(ctx).WithRateGroup("proxy.LookAsideBalancer", 60, 1)
defer b.wg.Done()
ticker := time.NewTicker(checkQueryNodeHealthInterval)
......
......@@ -18,6 +18,7 @@ package proxy
import (
"context"
"math"
"testing"
"time"
......@@ -26,6 +27,7 @@ import (
"github.com/milvus-io/milvus-proto/go-api/v2/milvuspb"
"github.com/milvus-io/milvus/internal/proto/internalpb"
"github.com/milvus-io/milvus/internal/types"
"github.com/milvus-io/milvus/pkg/util/merr"
"github.com/stretchr/testify/mock"
"github.com/stretchr/testify/suite"
"go.uber.org/atomic"
......@@ -108,6 +110,16 @@ func (suite *LookAsideBalancerSuite) TestCalculateScore() {
suite.Equal(float64(176), score6)
suite.Equal(float64(352), score7)
suite.Equal(float64(220), score8)
// test score overflow
costMetrics5 := &internalpb.CostAggregation{
ResponseTime: 5,
ServiceTime: 1,
TotalNQ: math.MaxInt64,
}
score9 := suite.balancer.calculateScore(costMetrics5, math.MaxInt64)
suite.Equal(math.MaxFloat64, score9)
}
func (suite *LookAsideBalancerSuite) TestSelectNode() {
......@@ -283,7 +295,7 @@ func (suite *LookAsideBalancerSuite) TestCheckHealthLoop() {
return suite.balancer.unreachableQueryNodes.Contain(1)
}, 2*time.Second, 100*time.Millisecond)
targetNode, err := suite.balancer.SelectNode(context.Background(), []int64{1}, 1)
suite.NoError(err)
suite.ErrorIs(err, merr.ErrNoAvailableNode)
suite.Equal(int64(-1), targetNode)
suite.Eventually(func() bool {
......
......@@ -746,7 +746,7 @@ func (node *QueryNode) SearchSegments(ctx context.Context, req *querypb.SearchRe
result := task.Result()
if result.CostAggregation != nil {
// update channel's response time
result.CostAggregation.ResponseTime = int64(latency)
result.CostAggregation.ResponseTime = latency.Milliseconds()
}
return result, nil
}
......
......@@ -179,7 +179,6 @@ func (s *Scheduler) mergeTasks(t Task) {
merged := false
for _, task := range s.mergingSearchTasks {
if task.Merge(t) {
s.waitingTaskTotalNQ.Sub(t.nq)
merged = true
break
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册