From 662320c07d003ce6d04d6a9ef73ca87fffd5bd12 Mon Sep 17 00:00:00 2001 From: wxyu Date: Sat, 24 Aug 2019 11:24:36 +0800 Subject: [PATCH] MS-412 Fix gpu cache logical error Former-commit-id: 6bd2a056feee54393fa4bc16b1b233f54dac0500 --- cpp/CHANGELOG.md | 3 ++- cpp/conf/server_config.template | 10 +++---- cpp/src/db/engine/ExecutionEngineImpl.cpp | 20 ++++++++------ .../scheduler/action/PushTaskToNeighbour.cpp | 26 ++++++++++++++++--- 4 files changed, 42 insertions(+), 17 deletions(-) diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 812b0907..671d865a 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -5,6 +5,8 @@ Please mark all change in change log and use the ticket from JIRA. # Milvus 0.4.0 (2019-07-28) ## Bug +- MS-411 - Fix metric unittest linking error +- MS-412 - Fix gpu cache logical error ## Improvement - MS-327 - Clean code for milvus @@ -80,7 +82,6 @@ Please mark all change in change log and use the ticket from JIRA. - MS-330 - Stability test failed caused by server core dumped - MS-347 - Build index hangs again - MS-382 - fix MySQLMetaImpl::CleanUpFilesWithTTL unknown column bug -- MS-411 - Fix metric unittest linking error ## Improvement - MS-156 - Add unittest for merge result functions diff --git a/cpp/conf/server_config.template b/cpp/conf/server_config.template index d895dbbf..0a363eff 100644 --- a/cpp/conf/server_config.template +++ b/cpp/conf/server_config.template @@ -65,21 +65,21 @@ resource_config: memory: 64 device_id: 0 enable_loader: true - enable_executor: true + enable_executor: false gtx1060: type: GPU memory: 6 device_id: 0 - enable_loader: false - enable_executor: false + enable_loader: true + enable_executor: true gtx1660: type: GPU memory: 6 device_id: 1 - enable_loader: false - enable_executor: false + enable_loader: true + enable_executor: true # connection list, length: 0~N # format: -${resource_name}===${resource_name} diff --git a/cpp/src/db/engine/ExecutionEngineImpl.cpp b/cpp/src/db/engine/ExecutionEngineImpl.cpp index 5ef2fc0c..7489e784 100644 --- a/cpp/src/db/engine/ExecutionEngineImpl.cpp +++ b/cpp/src/db/engine/ExecutionEngineImpl.cpp @@ -139,9 +139,11 @@ Status ExecutionEngineImpl::Load(bool to_cache) { } Status ExecutionEngineImpl::CopyToGpu(uint64_t device_id) { - index_ = zilliz::milvus::cache::GpuCacheMgr::GetInstance(device_id)->GetIndex(location_); - bool already_in_cache = (index_ != nullptr); - if (!index_) { + auto index = zilliz::milvus::cache::GpuCacheMgr::GetInstance(device_id)->GetIndex(location_); + bool already_in_cache = (index != nullptr); + if (already_in_cache) { + index_ = index; + } else { try { index_ = index_->CopyToGpu(device_id); ENGINE_LOG_DEBUG << "CPU to GPU" << device_id; @@ -161,9 +163,11 @@ Status ExecutionEngineImpl::CopyToGpu(uint64_t device_id) { } Status ExecutionEngineImpl::CopyToCpu() { - index_ = zilliz::milvus::cache::CpuCacheMgr::GetInstance()->GetIndex(location_); - bool already_in_cache = (index_ != nullptr); - if (!index_) { + auto index = zilliz::milvus::cache::CpuCacheMgr::GetInstance()->GetIndex(location_); + bool already_in_cache = (index != nullptr); + if (already_in_cache) { + index_ = index; + } else { try { index_ = index_->CopyToCpu(); ENGINE_LOG_DEBUG << "GPU to CPU"; @@ -175,7 +179,7 @@ Status ExecutionEngineImpl::CopyToCpu() { } } - if(!already_in_cache) { + if (!already_in_cache) { Cache(); } return Status::OK(); @@ -276,7 +280,7 @@ Status ExecutionEngineImpl::Init() { using namespace zilliz::milvus::server; ServerConfig &config = ServerConfig::GetInstance(); ConfigNode server_config = config.GetConfig(CONFIG_SERVER); - gpu_num_ = server_config.GetInt32Value("gpu_index", 0); + gpu_num_ = server_config.GetInt32Value("gpu_index", 0); return Status::OK(); } diff --git a/cpp/src/scheduler/action/PushTaskToNeighbour.cpp b/cpp/src/scheduler/action/PushTaskToNeighbour.cpp index 7c5855c1..9afeac68 100644 --- a/cpp/src/scheduler/action/PushTaskToNeighbour.cpp +++ b/cpp/src/scheduler/action/PushTaskToNeighbour.cpp @@ -5,6 +5,7 @@ ******************************************************************************/ #include +#include #include "Action.h" @@ -38,6 +39,22 @@ push_task_round_robin(TaskTable &self_task_table, std::list &neighb } } +void +push_task_randomly(TaskTable &self_task_table, std::vector &neighbours) { + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution dist(0, neighbours.size() - 1); + CacheMgr cache; + + auto indexes = PickToMove(self_task_table, cache, self_task_table.Size()); + for (auto index : indexes) { + if (self_task_table.Move(index)) { + auto task = self_task_table.Get(index)->task; + neighbours[dist(mt)]->task_table().Put(task); + } + } +} + void Action::PushTaskToNeighbour(const ResourceWPtr &res) { auto self = res.lock(); @@ -60,18 +77,21 @@ Action::PushTaskToNeighbourHasExecutor(const ResourceWPtr &res) { auto self = res.lock(); if (not self) return; - std::list neighbours; + std::list l_neighbours; + std::vector v_neighbours; for (auto &neighbour_node : self->GetNeighbours()) { auto node = neighbour_node.neighbour_node.lock(); if (not node) continue; auto resource = std::static_pointer_cast(node); if (resource->HasExecutor()) { - neighbours.emplace_back(resource); + l_neighbours.push_back(resource); + v_neighbours.push_back(resource); } } - push_task_round_robin(self->task_table(), neighbours); +// push_task_round_robin(self->task_table(), l_neighbours); + push_task_randomly(self->task_table(), v_neighbours); } -- GitLab