impl.go 27.7 KB
Newer Older
// Licensed to the LF AI & Data foundation under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

17 18 19 20
package querynode

import (
	"context"
21
	"errors"
22
	"fmt"
23
	"sync"
24 25 26

	"go.uber.org/zap"

27
	"github.com/milvus-io/milvus/internal/common"
X
Xiangyu Wang 已提交
28 29 30 31
	"github.com/milvus-io/milvus/internal/log"
	"github.com/milvus-io/milvus/internal/proto/commonpb"
	"github.com/milvus-io/milvus/internal/proto/internalpb"
	"github.com/milvus-io/milvus/internal/proto/milvuspb"
32
	"github.com/milvus-io/milvus/internal/proto/querypb"
X
Xiangyu Wang 已提交
33
	queryPb "github.com/milvus-io/milvus/internal/proto/querypb"
34
	"github.com/milvus-io/milvus/internal/util/metricsinfo"
X
Xiangyu Wang 已提交
35
	"github.com/milvus-io/milvus/internal/util/typeutil"
36 37
)

38
// GetComponentStates returns information about whether the node is healthy
39 40 41 42 43 44
func (node *QueryNode) GetComponentStates(ctx context.Context) (*internalpb.ComponentStates, error) {
	stats := &internalpb.ComponentStates{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
		},
	}
45 46 47
	code, ok := node.stateCode.Load().(internalpb.StateCode)
	if !ok {
		errMsg := "unexpected error in type assertion"
48 49
		stats.Status = &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
50
			Reason:    errMsg,
51
		}
G
godchen 已提交
52
		return stats, nil
53 54 55 56
	}
	nodeID := common.NotRegisteredID
	if node.session != nil && node.session.Registered() {
		nodeID = node.session.ServerID
57 58
	}
	info := &internalpb.ComponentInfo{
59
		NodeID:    nodeID,
60 61 62 63
		Role:      typeutil.QueryNodeRole,
		StateCode: code,
	}
	stats.State = info
64
	log.Debug("Get QueryNode component state done", zap.Any("stateCode", info.StateCode))
65 66 67
	return stats, nil
}

68 69
// GetTimeTickChannel returns the time tick channel
// TimeTickChannel contains many time tick messages, which will be sent by query nodes
70 71 72 73 74 75
func (node *QueryNode) GetTimeTickChannel(ctx context.Context) (*milvuspb.StringResponse, error) {
	return &milvuspb.StringResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
			Reason:    "",
		},
76
		Value: Params.CommonCfg.QueryCoordTimeTick,
77 78 79
	}, nil
}

80
// GetStatisticsChannel returns the statistics channel
81
// Statistics channel contains statistics infos of query nodes, such as segment infos, memory infos
82 83 84 85 86 87
func (node *QueryNode) GetStatisticsChannel(ctx context.Context) (*milvuspb.StringResponse, error) {
	return &milvuspb.StringResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
			Reason:    "",
		},
88
		Value: Params.CommonCfg.QueryNodeStats,
89 90 91
	}, nil
}

92
// AddQueryChannel watch queryChannel of the collection to receive query message
93
func (node *QueryNode) AddQueryChannel(ctx context.Context, in *queryPb.AddQueryChannelRequest) (*commonpb.Status, error) {
94 95
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
96
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
97 98 99 100
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
G
godchen 已提交
101
		return status, nil
102
	}
103 104 105 106 107 108 109
	dct := &addQueryChannelTask{
		baseTask: baseTask{
			ctx:  ctx,
			done: make(chan error),
		},
		req:  in,
		node: node,
110
	}
111

112
	err := node.scheduler.queue.Enqueue(dct)
113 114 115 116
	if err != nil {
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
117
		}
X
Xiaofan 已提交
118
		log.Warn(err.Error())
G
godchen 已提交
119
		return status, nil
120
	}
X
Xiaofan 已提交
121
	log.Info("addQueryChannelTask Enqueue done",
122 123 124 125
		zap.Int64("collectionID", in.CollectionID),
		zap.String("queryChannel", in.QueryChannel),
		zap.String("queryResultChannel", in.QueryResultChannel),
	)
126

127 128 129 130 131 132
	waitFunc := func() (*commonpb.Status, error) {
		err = dct.WaitToFinish()
		if err != nil {
			status := &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
133
			}
X
Xiaofan 已提交
134
			log.Warn(err.Error())
G
godchen 已提交
135
			return status, nil
136
		}
X
Xiaofan 已提交
137
		log.Info("addQueryChannelTask WaitToFinish done",
138 139 140 141 142
			zap.Int64("collectionID", in.CollectionID),
			zap.String("queryChannel", in.QueryChannel),
			zap.String("queryResultChannel", in.QueryResultChannel),
		)

143 144 145
		return &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
		}, nil
146
	}
147

148
	return waitFunc()
149 150
}

151
// RemoveQueryChannel remove queryChannel of the collection to stop receiving query message
152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
func (node *QueryNode) RemoveQueryChannel(ctx context.Context, in *queryPb.RemoveQueryChannelRequest) (*commonpb.Status, error) {
	// if node.searchService == nil || node.searchService.searchMsgStream == nil {
	// 	errMsg := "null search service or null search result message stream"
	// 	status := &commonpb.Status{
	// 		ErrorCode: commonpb.ErrorCode_UnexpectedError,
	// 		Reason:    errMsg,
	// 	}

	// 	return status, errors.New(errMsg)
	// }

	// searchStream, ok := node.searchService.searchMsgStream.(*pulsarms.PulsarMsgStream)
	// if !ok {
	// 	errMsg := "type assertion failed for search message stream"
	// 	status := &commonpb.Status{
	// 		ErrorCode: commonpb.ErrorCode_UnexpectedError,
	// 		Reason:    errMsg,
	// 	}

	// 	return status, errors.New(errMsg)
	// }

	// resultStream, ok := node.searchService.searchResultMsgStream.(*pulsarms.PulsarMsgStream)
	// if !ok {
	// 	errMsg := "type assertion failed for search result message stream"
	// 	status := &commonpb.Status{
	// 		ErrorCode: commonpb.ErrorCode_UnexpectedError,
	// 		Reason:    errMsg,
	// 	}

	// 	return status, errors.New(errMsg)
	// }

	// // remove request channel
	// consumeChannels := []string{in.RequestChannelID}
	// consumeSubName := Params.MsgChannelSubName
	// // TODO: searchStream.RemovePulsarConsumers(producerChannels)
	// searchStream.AsConsumer(consumeChannels, consumeSubName)

	// // remove result channel
	// producerChannels := []string{in.ResultChannelID}
	// // TODO: resultStream.RemovePulsarProducer(producerChannels)
	// resultStream.AsProducer(producerChannels)

	status := &commonpb.Status{
		ErrorCode: commonpb.ErrorCode_Success,
	}
	return status, nil
}

G
godchen 已提交
202
// WatchDmChannels create consumers on dmChannels to receive Incremental data,which is the important part of real-time query
203
func (node *QueryNode) WatchDmChannels(ctx context.Context, in *queryPb.WatchDmChannelsRequest) (*commonpb.Status, error) {
204 205
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
206
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
207 208 209 210
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
G
godchen 已提交
211
		return status, nil
212
	}
213 214 215 216 217 218 219
	dct := &watchDmChannelsTask{
		baseTask: baseTask{
			ctx:  ctx,
			done: make(chan error),
		},
		req:  in,
		node: node,
220 221
	}

222 223
	err := node.scheduler.queue.Enqueue(dct)
	if err != nil {
224 225
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
226
			Reason:    err.Error(),
227
		}
X
Xiaofan 已提交
228
		log.Warn(err.Error())
G
godchen 已提交
229
		return status, nil
230
	}
X
Xiaofan 已提交
231
	log.Info("watchDmChannelsTask Enqueue done", zap.Int64("collectionID", in.CollectionID), zap.Int64("nodeID", Params.QueryNodeCfg.GetNodeID()), zap.Int64("replicaID", in.GetReplicaID()))
232
	waitFunc := func() (*commonpb.Status, error) {
233
		err = dct.WaitToFinish()
234
		if err != nil {
235 236 237 238
			status := &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
			}
X
Xiaofan 已提交
239
			log.Warn(err.Error())
G
godchen 已提交
240
			return status, nil
241
		}
X
Xiaofan 已提交
242
		log.Info("watchDmChannelsTask WaitToFinish done", zap.Int64("collectionID", in.CollectionID), zap.Int64("nodeID", Params.QueryNodeCfg.GetNodeID()))
243 244 245
		return &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
		}, nil
246
	}
247 248

	return waitFunc()
249 250
}

G
godchen 已提交
251
// WatchDeltaChannels create consumers on dmChannels to receive Incremental data,which is the important part of real-time query
252
func (node *QueryNode) WatchDeltaChannels(ctx context.Context, in *queryPb.WatchDeltaChannelsRequest) (*commonpb.Status, error) {
253 254
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
255
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
256 257 258 259
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
G
godchen 已提交
260
		return status, nil
261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276
	}
	dct := &watchDeltaChannelsTask{
		baseTask: baseTask{
			ctx:  ctx,
			done: make(chan error),
		},
		req:  in,
		node: node,
	}

	err := node.scheduler.queue.Enqueue(dct)
	if err != nil {
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
X
Xiaofan 已提交
277
		log.Warn(err.Error())
G
godchen 已提交
278
		return status, nil
279
	}
X
Xiaofan 已提交
280 281

	log.Info("watchDeltaChannelsTask Enqueue done", zap.Int64("collectionID", in.CollectionID), zap.Int64("nodeID", Params.QueryNodeCfg.GetNodeID()))
282 283 284 285 286 287 288 289

	waitFunc := func() (*commonpb.Status, error) {
		err = dct.WaitToFinish()
		if err != nil {
			status := &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
			}
X
Xiaofan 已提交
290
			log.Warn(err.Error())
G
godchen 已提交
291
			return status, nil
292
		}
X
Xiaofan 已提交
293 294

		log.Info("watchDeltaChannelsTask WaitToFinish done", zap.Int64("collectionID", in.CollectionID), zap.Int64("nodeID", Params.QueryNodeCfg.GetNodeID()))
295 296 297 298 299 300
		return &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
		}, nil
	}

	return waitFunc()
301 302
}

303
// LoadSegments load historical data into query node, historical data can be vector data or index
304
func (node *QueryNode) LoadSegments(ctx context.Context, in *queryPb.LoadSegmentsRequest) (*commonpb.Status, error) {
305 306
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
307
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
308 309 310 311
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
G
godchen 已提交
312
		return status, nil
313
	}
314 315 316 317 318 319 320 321 322 323 324 325 326 327 328
	dct := &loadSegmentsTask{
		baseTask: baseTask{
			ctx:  ctx,
			done: make(chan error),
		},
		req:  in,
		node: node,
	}

	err := node.scheduler.queue.Enqueue(dct)
	if err != nil {
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
X
Xiaofan 已提交
329
		log.Warn(err.Error())
G
godchen 已提交
330
		return status, nil
331
	}
332 333 334 335
	segmentIDs := make([]UniqueID, 0)
	for _, info := range in.Infos {
		segmentIDs = append(segmentIDs, info.SegmentID)
	}
X
Xiaofan 已提交
336
	log.Info("loadSegmentsTask Enqueue done", zap.Int64("collectionID", in.CollectionID), zap.Int64s("segmentIDs", segmentIDs), zap.Int64("nodeID", Params.QueryNodeCfg.GetNodeID()))
337

338
	waitFunc := func() (*commonpb.Status, error) {
339 340
		err = dct.WaitToFinish()
		if err != nil {
341 342 343 344
			status := &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
			}
X
Xiaofan 已提交
345
			log.Warn(err.Error())
G
godchen 已提交
346
			return status, nil
347
		}
X
Xiaofan 已提交
348
		log.Info("loadSegmentsTask WaitToFinish done", zap.Int64("collectionID", in.CollectionID), zap.Int64s("segmentIDs", segmentIDs), zap.Int64("nodeID", Params.QueryNodeCfg.GetNodeID()))
349 350 351
		return &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
		}, nil
352
	}
353 354

	return waitFunc()
355 356
}

G
godchen 已提交
357
// ReleaseCollection clears all data related to this collection on the querynode
358
func (node *QueryNode) ReleaseCollection(ctx context.Context, in *queryPb.ReleaseCollectionRequest) (*commonpb.Status, error) {
359 360
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
361
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
362 363 364 365
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
G
godchen 已提交
366
		return status, nil
367
	}
368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
	dct := &releaseCollectionTask{
		baseTask: baseTask{
			ctx:  ctx,
			done: make(chan error),
		},
		req:  in,
		node: node,
	}

	err := node.scheduler.queue.Enqueue(dct)
	if err != nil {
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
X
Xiaofan 已提交
383
		log.Warn(err.Error())
G
godchen 已提交
384
		return status, nil
385
	}
X
Xiaofan 已提交
386
	log.Info("releaseCollectionTask Enqueue done", zap.Int64("collectionID", in.CollectionID))
387

388
	func() {
389 390
		err = dct.WaitToFinish()
		if err != nil {
X
Xiaofan 已提交
391
			log.Warn(err.Error())
392
			return
393
		}
X
Xiaofan 已提交
394
		log.Info("releaseCollectionTask WaitToFinish done", zap.Int64("collectionID", in.CollectionID))
395
	}()
396 397 398 399 400 401 402

	status := &commonpb.Status{
		ErrorCode: commonpb.ErrorCode_Success,
	}
	return status, nil
}

403
// ReleasePartitions clears all data related to this partition on the querynode
404
func (node *QueryNode) ReleasePartitions(ctx context.Context, in *queryPb.ReleasePartitionsRequest) (*commonpb.Status, error) {
405 406
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
407
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
408 409 410 411
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
G
godchen 已提交
412
		return status, nil
413
	}
414 415 416 417 418 419 420 421 422 423 424 425 426 427 428
	dct := &releasePartitionsTask{
		baseTask: baseTask{
			ctx:  ctx,
			done: make(chan error),
		},
		req:  in,
		node: node,
	}

	err := node.scheduler.queue.Enqueue(dct)
	if err != nil {
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
X
Xiaofan 已提交
429
		log.Warn(err.Error())
G
godchen 已提交
430
		return status, nil
431
	}
X
Xiaofan 已提交
432
	log.Info("releasePartitionsTask Enqueue done", zap.Int64("collectionID", in.CollectionID), zap.Int64s("partitionIDs", in.PartitionIDs))
433

434
	func() {
435 436
		err = dct.WaitToFinish()
		if err != nil {
X
Xiaofan 已提交
437
			log.Warn(err.Error())
438
			return
439
		}
X
Xiaofan 已提交
440
		log.Info("releasePartitionsTask WaitToFinish done", zap.Int64("collectionID", in.CollectionID), zap.Int64s("partitionIDs", in.PartitionIDs))
441
	}()
442 443 444 445 446 447 448

	status := &commonpb.Status{
		ErrorCode: commonpb.ErrorCode_Success,
	}
	return status, nil
}

449
// ReleaseSegments remove the specified segments from query node according segmentIDs, partitionIDs, and collectionID
450
func (node *QueryNode) ReleaseSegments(ctx context.Context, in *queryPb.ReleaseSegmentsRequest) (*commonpb.Status, error) {
451 452
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
453
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
454 455 456 457
		status := &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}
G
godchen 已提交
458
		return status, nil
459
	}
460 461 462
	status := &commonpb.Status{
		ErrorCode: commonpb.ErrorCode_Success,
	}
463
	// collection lock is not needed since we guarantee not query/search will be dispatch from leader
464
	for _, id := range in.SegmentIDs {
465
		err := node.historical.removeSegment(id)
466 467 468 469 470
		if err != nil {
			// not return, try to release all segments
			status.ErrorCode = commonpb.ErrorCode_UnexpectedError
			status.Reason = err.Error()
		}
471
		err = node.streaming.removeSegment(id)
472
		if err != nil {
473 474
			// not return, try to release all segments
			status.ErrorCode = commonpb.ErrorCode_UnexpectedError
475
			status.Reason = err.Error()
476 477
		}
	}
X
xige-16 已提交
478

X
Xiaofan 已提交
479
	log.Info("release segments done", zap.Int64("collectionID", in.CollectionID), zap.Int64s("segmentIDs", in.SegmentIDs))
480 481 482
	return status, nil
}

483
// GetSegmentInfo returns segment information of the collection on the queryNode, and the information includes memSize, numRow, indexName, indexID ...
484
func (node *QueryNode) GetSegmentInfo(ctx context.Context, in *queryPb.GetSegmentInfoRequest) (*queryPb.GetSegmentInfoResponse, error) {
485 486
	code := node.stateCode.Load().(internalpb.StateCode)
	if code != internalpb.StateCode_Healthy {
X
Xiaofan 已提交
487
		err := fmt.Errorf("query node %d is not ready", Params.QueryNodeCfg.GetNodeID())
488 489 490 491 492 493
		res := &queryPb.GetSegmentInfoResponse{
			Status: &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
			},
		}
G
godchen 已提交
494
		return res, nil
495
	}
496 497 498 499 500 501
	var segmentInfos []*queryPb.SegmentInfo

	segmentIDs := make(map[int64]struct{})
	for _, segmentID := range in.GetSegmentIDs() {
		segmentIDs[segmentID] = struct{}{}
	}
502

503
	// get info from historical
504
	historicalSegmentInfos, err := node.historical.getSegmentInfosByColID(in.CollectionID)
505
	if err != nil {
X
Xiaofan 已提交
506
		log.Warn("GetSegmentInfo: get historical segmentInfo failed", zap.Int64("collectionID", in.CollectionID), zap.Error(err))
507 508 509 510 511 512
		res := &queryPb.GetSegmentInfoResponse{
			Status: &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
			},
		}
G
godchen 已提交
513
		return res, nil
514
	}
515
	segmentInfos = append(segmentInfos, filterSegmentInfo(historicalSegmentInfos, segmentIDs)...)
516

517
	// get info from streaming
518
	streamingSegmentInfos, err := node.streaming.getSegmentInfosByColID(in.CollectionID)
519
	if err != nil {
X
Xiaofan 已提交
520
		log.Warn("GetSegmentInfo: get streaming segmentInfo failed", zap.Int64("collectionID", in.CollectionID), zap.Error(err))
521 522 523 524 525 526
		res := &queryPb.GetSegmentInfoResponse{
			Status: &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
			},
		}
G
godchen 已提交
527
		return res, nil
528
	}
529
	segmentInfos = append(segmentInfos, filterSegmentInfo(streamingSegmentInfos, segmentIDs)...)
530

531 532 533 534
	return &queryPb.GetSegmentInfoResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_Success,
		},
535
		Infos: segmentInfos,
536 537
	}, nil
}
538

539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554
// filterSegmentInfo keeps the entries of segmentInfos whose segment ID is a
// key of segmentIDs, preserving their order. An empty (or nil) segmentIDs
// means "no filter": the input slice is returned unchanged.
func filterSegmentInfo(segmentInfos []*queryPb.SegmentInfo, segmentIDs map[int64]struct{}) []*queryPb.SegmentInfo {
	if len(segmentIDs) == 0 {
		return segmentInfos
	}

	kept := make([]*queryPb.SegmentInfo, 0, len(segmentIDs))
	for i := range segmentInfos {
		if _, wanted := segmentIDs[segmentInfos[i].GetSegmentID()]; wanted {
			kept = append(kept, segmentInfos[i])
		}
	}
	return kept
}

555
// isHealthy checks if QueryNode is healthy
556 557 558 559 560
func (node *QueryNode) isHealthy() bool {
	code := node.stateCode.Load().(internalpb.StateCode)
	return code == internalpb.StateCode_Healthy
}

561
// Search performs replica search tasks.
562
func (node *QueryNode) Search(ctx context.Context, req *queryPb.SearchRequest) (*internalpb.SearchResults, error) {
563 564 565 566 567
	failRet := &internalpb.SearchResults{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
		},
	}
568
	if !node.isHealthy() {
569 570
		failRet.Status.Reason = msgQueryNodeIsUnhealthy(Params.QueryNodeCfg.GetNodeID())
		return failRet, nil
571 572 573 574 575
	}

	log.Debug("Received SearchRequest", zap.String("vchannel", req.GetDmlChannel()), zap.Int64s("segmentIDs", req.GetSegmentIDs()))

	if node.queryShardService == nil {
576 577
		failRet.Status.Reason = "queryShardService is nil"
		return failRet, nil
578 579
	}

580 581 582 583 584 585 586 587
	if !node.queryShardService.hasQueryShard(req.GetDmlChannel()) {
		// TODO: add replicaID in request or remove it in query shard
		err := node.queryShardService.addQueryShard(req.Req.CollectionID, req.GetDmlChannel(), 0)
		if err != nil {
			failRet.Status.Reason = err.Error()
			return failRet, nil
		}
	}
588 589
	qs, err := node.queryShardService.getQueryShard(req.GetDmlChannel())
	if err != nil {
590
		log.Warn("Search failed, failed to get query shard", zap.String("dml channel", req.GetDmlChannel()), zap.Error(err))
591 592 593
		failRet.Status.ErrorCode = commonpb.ErrorCode_NotShardLeader
		failRet.Status.Reason = err.Error()
		return failRet, nil
594 595
	}

596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687
	if req.FromShardLeader {
		historicalTask, err2 := newSearchTask(ctx, req)
		if err2 != nil {
			failRet.Status.Reason = err2.Error()
			return failRet, nil
		}
		historicalTask.QS = qs
		historicalTask.DataScope = querypb.DataScope_Historical
		err2 = node.scheduler.AddReadTask(ctx, historicalTask)
		if err2 != nil {
			failRet.Status.Reason = err2.Error()
			return failRet, nil
		}

		err2 = historicalTask.WaitToFinish()
		if err2 != nil {
			failRet.Status.Reason = err2.Error()
			return failRet, nil
		}
		return historicalTask.Ret, nil
	}

	//from Proxy
	cluster, ok := qs.clusterService.getShardCluster(req.GetDmlChannel())
	if !ok {
		failRet.Status.Reason = fmt.Sprintf("channel %s leader is not here", req.GetDmlChannel())
		return failRet, nil
	}

	searchCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	var results []*internalpb.SearchResults
	var streamingResult *internalpb.SearchResults

	var wg sync.WaitGroup
	var errCluster error

	wg.Add(1) // search cluster
	go func() {
		defer wg.Done()
		// shard leader dispatches request to its shard cluster
		oResults, cErr := cluster.Search(searchCtx, req)
		if cErr != nil {
			log.Warn("search cluster failed", zap.Int64("collectionID", req.Req.GetCollectionID()), zap.Error(cErr))
			cancel()
			errCluster = cErr
			return
		}
		results = oResults
	}()

	var errStreaming error
	wg.Add(1) // search streaming
	go func() {
		defer func() {
			if errStreaming != nil {
				cancel()
			}
		}()

		defer wg.Done()
		streamingTask, err2 := newSearchTask(searchCtx, req)
		if err2 != nil {
			errStreaming = err2
		}
		streamingTask.QS = qs
		streamingTask.DataScope = querypb.DataScope_Streaming
		err2 = node.scheduler.AddReadTask(searchCtx, streamingTask)
		if err2 != nil {
			errStreaming = err2
			return
		}
		err2 = streamingTask.WaitToFinish()
		if err2 != nil {
			errStreaming = err2
			return
		}
		streamingResult = streamingTask.Ret
	}()
	wg.Wait()

	var mainErr error
	if errCluster != nil {
		mainErr = errCluster
		if errors.Is(errCluster, context.Canceled) {
			if errStreaming != nil {
				mainErr = errStreaming
			}
		}
	} else if errStreaming != nil {
		mainErr = errStreaming
688 689
	}

690 691 692 693 694 695 696 697 698 699 700
	if mainErr != nil {
		failRet.Status.Reason = mainErr.Error()
		return failRet, nil
	}
	results = append(results, streamingResult)
	ret, err2 := reduceSearchResults(results, req.Req.GetNq(), req.Req.GetTopk(), req.Req.GetMetricType())
	if err2 != nil {
		failRet.Status.Reason = err2.Error()
		return failRet, nil
	}
	return ret, nil
701 702 703
}

// Query performs replica query tasks.
704
func (node *QueryNode) Query(ctx context.Context, req *queryPb.QueryRequest) (*internalpb.RetrieveResults, error) {
705 706 707 708 709
	failRet := &internalpb.RetrieveResults{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
		},
	}
710
	if !node.isHealthy() {
711 712
		failRet.Status.Reason = msgQueryNodeIsUnhealthy(Params.QueryNodeCfg.GetNodeID())
		return failRet, nil
713 714 715 716
	}
	log.Debug("Received QueryRequest", zap.String("vchannel", req.GetDmlChannel()), zap.Int64s("segmentIDs", req.GetSegmentIDs()))

	if node.queryShardService == nil {
717 718 719 720 721 722 723 724
		failRet.Status.Reason = "queryShardService is nil"
		return failRet, nil
	}

	if !node.queryShardService.hasQueryShard(req.GetDmlChannel()) {
		err := node.queryShardService.addQueryShard(req.Req.CollectionID, req.GetDmlChannel(), 0) // TODO: add replicaID in request or remove it in query shard
		failRet.Status.Reason = err.Error()
		return failRet, nil
725 726 727 728
	}

	qs, err := node.queryShardService.getQueryShard(req.GetDmlChannel())
	if err != nil {
729
		log.Warn("Query failed, failed to get query shard", zap.String("dml channel", req.GetDmlChannel()), zap.Error(err))
730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750
		failRet.Status.Reason = err.Error()
		return failRet, nil
	}

	if req.FromShardLeader {
		// construct a queryTask
		queryTask := newQueryTask(ctx, req)
		queryTask.QS = qs
		queryTask.DataScope = querypb.DataScope_Historical
		err2 := node.scheduler.AddReadTask(ctx, queryTask)
		if err2 != nil {
			failRet.Status.Reason = err2.Error()
			return failRet, nil
		}

		err2 = queryTask.WaitToFinish()
		if err2 != nil {
			failRet.Status.Reason = err2.Error()
			return failRet, nil
		}
		return queryTask.Ret, nil
751 752
	}

753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817
	cluster, ok := qs.clusterService.getShardCluster(req.GetDmlChannel())
	if !ok {
		failRet.Status.Reason = fmt.Sprintf("channel %s leader is not here", req.GetDmlChannel())
		return failRet, nil
	}

	// add cancel when error occurs
	queryCtx, cancel := context.WithCancel(ctx)
	defer cancel()

	var results []*internalpb.RetrieveResults
	var streamingResult *internalpb.RetrieveResults
	var wg sync.WaitGroup

	var errCluster error
	wg.Add(1)
	go func() {
		defer wg.Done()
		// shard leader dispatches request to its shard cluster
		oResults, cErr := cluster.Query(queryCtx, req)
		if cErr != nil {
			log.Warn("failed to query cluster", zap.Int64("collectionID", req.Req.GetCollectionID()), zap.Error(cErr))
			log.Info("czs_query_cluster_cancel", zap.Error(cErr))
			errCluster = cErr
			cancel()
			return
		}
		results = oResults
	}()

	var errStreaming error
	wg.Add(1)
	go func() {
		defer wg.Done()
		streamingTask := newQueryTask(queryCtx, req)
		streamingTask.DataScope = querypb.DataScope_Streaming
		streamingTask.QS = qs
		err2 := node.scheduler.AddReadTask(queryCtx, streamingTask)
		defer func() {
			errStreaming = err2
			if err2 != nil {
				cancel()
			}
		}()
		if err2 != nil {
			return
		}
		err2 = streamingTask.WaitToFinish()
		if err2 != nil {
			return
		}
		streamingResult = streamingTask.Ret
	}()
	wg.Wait()

	var mainErr error
	if errCluster != nil {
		mainErr = errCluster
		if errors.Is(errCluster, context.Canceled) {
			if errStreaming != nil {
				mainErr = errStreaming
			}
		}
	} else if errStreaming != nil {
		mainErr = errStreaming
818 819
	}

820 821 822 823 824 825 826 827 828 829 830
	if mainErr != nil {
		failRet.Status.Reason = mainErr.Error()
		return failRet, nil
	}
	results = append(results, streamingResult)
	ret, err2 := mergeInternalRetrieveResults(results)
	if err2 != nil {
		failRet.Status.Reason = err2.Error()
		return failRet, nil
	}
	return ret, nil
831 832
}

833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857
// SyncReplicaSegments syncs replica node & segments states by forwarding the
// request's vchannel name and replica segments to
// node.ShardClusterService.SyncReplicaSegments.
//
// The returned error is always nil; an unhealthy node or a failed sync is
// reported as ErrorCode_UnexpectedError in the returned status.
func (node *QueryNode) SyncReplicaSegments(ctx context.Context, req *querypb.SyncReplicaSegmentsRequest) (*commonpb.Status, error) {
	if !node.isHealthy() {
		return &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    msgQueryNodeIsUnhealthy(Params.QueryNodeCfg.GetNodeID()),
		}, nil
	}

	log.Debug("Received SyncReplicaSegments request", zap.String("vchannelName", req.GetVchannelName()))

	err := node.ShardClusterService.SyncReplicaSegments(req.GetVchannelName(), req.GetReplicaSegments())
	if err != nil {
		log.Warn("failed to sync replica segments", zap.String("vchannel", req.GetVchannelName()), zap.Error(err))
		return &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    err.Error(),
		}, nil
	}

	log.Debug("SyncReplicaSegments Done", zap.String("vchannel", req.GetVchannelName()))

	return &commonpb.Status{ErrorCode: commonpb.ErrorCode_Success}, nil
}

G
godchen 已提交
858
// GetMetrics return system infos of the query node, such as total memory, memory usage, cpu usage ...
859
// TODO(dragondriver): cache the Metrics and set a retention to the cache
860 861 862
func (node *QueryNode) GetMetrics(ctx context.Context, req *milvuspb.GetMetricsRequest) (*milvuspb.GetMetricsResponse, error) {
	if !node.isHealthy() {
		log.Warn("QueryNode.GetMetrics failed",
X
Xiaofan 已提交
863
			zap.Int64("node_id", Params.QueryNodeCfg.GetNodeID()),
864
			zap.String("req", req.Request),
X
Xiaofan 已提交
865
			zap.Error(errQueryNodeIsUnhealthy(Params.QueryNodeCfg.GetNodeID())))
866 867 868 869

		return &milvuspb.GetMetricsResponse{
			Status: &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
X
Xiaofan 已提交
870
				Reason:    msgQueryNodeIsUnhealthy(Params.QueryNodeCfg.GetNodeID()),
871 872 873 874 875 876 877 878
			},
			Response: "",
		}, nil
	}

	metricType, err := metricsinfo.ParseMetricType(req.Request)
	if err != nil {
		log.Warn("QueryNode.GetMetrics failed to parse metric type",
X
Xiaofan 已提交
879
			zap.Int64("node_id", Params.QueryNodeCfg.GetNodeID()),
880 881 882 883 884 885 886 887 888 889 890 891 892 893
			zap.String("req", req.Request),
			zap.Error(err))

		return &milvuspb.GetMetricsResponse{
			Status: &commonpb.Status{
				ErrorCode: commonpb.ErrorCode_UnexpectedError,
				Reason:    err.Error(),
			},
			Response: "",
		}, nil
	}

	if metricType == metricsinfo.SystemInfoMetrics {
		metrics, err := getSystemInfoMetrics(ctx, req, node)
X
Xiaofan 已提交
894 895
		if err != nil {
			log.Warn("QueryNode.GetMetrics failed",
X
Xiaofan 已提交
896
				zap.Int64("node_id", Params.QueryNodeCfg.GetNodeID()),
X
Xiaofan 已提交
897 898 899 900
				zap.String("req", req.Request),
				zap.String("metric_type", metricType),
				zap.Error(err))
		}
901

G
godchen 已提交
902
		return metrics, nil
903 904 905
	}

	log.Debug("QueryNode.GetMetrics failed, request metric type is not implemented yet",
X
Xiaofan 已提交
906
		zap.Int64("node_id", Params.QueryNodeCfg.GetNodeID()),
907 908 909 910 911 912 913 914 915 916 917
		zap.String("req", req.Request),
		zap.String("metric_type", metricType))

	return &milvuspb.GetMetricsResponse{
		Status: &commonpb.Status{
			ErrorCode: commonpb.ErrorCode_UnexpectedError,
			Reason:    metricsinfo.MsgUnimplementedMetric,
		},
		Response: "",
	}, nil
}