提交 fddfd1eb 编写于 作者: Y yudong.cai

#346 update config APIs to support build index with multiple GPUs

上级 a47b7284
......@@ -27,7 +27,6 @@ metric_config:
port: 8080 # port prometheus uses to fetch metrics, must be in range [1025, 65534]
cache_config:
cpu_cache_capacity: 16 # GB, CPU memory used for cache, must be a positive integer
cpu_cache_threshold: 0.85 # percentage of data that will be kept when cache cleanup is triggered, must be in range (0.0, 1.0]
cache_insert_data: false # whether to load inserted data into cache, must be a boolean
......@@ -38,6 +37,7 @@ engine_config:
gpu_search_threshold: 1000 # threshold beyond which the search computation is executed on GPUs only
resource_config:
search_resources: # define the device used for search computation
search_resources: # define the devices used for search computation, must be in format: cpu or gpux
- cpu
index_build_resources: # define the devices used for index building, must be in format: cpu or gpux
- cpu
index_build_device: cpu # CPU used for building index
......@@ -42,4 +42,5 @@ resource_config:
search_resources: # define the devices used for search computation, must be in format: cpu or gpux
- cpu
- gpu0
index_build_device: gpu0 # CPU / GPU used for building index, must be in format: cpu or gpux
index_build_resources: # define the devices used for index building, must be in format: cpu or gpux
- gpu0
\ No newline at end of file
......@@ -215,8 +215,8 @@ Config::ValidateConfig() {
return s;
}
int32_t resource_index_build_device;
s = GetResourceConfigIndexBuildDevice(resource_index_build_device);
std::vector<std::string> index_build_resources;
s = GetResourceConfigIndexBuildResources(index_build_resources);
if (!s.ok()) {
return s;
}
......@@ -351,7 +351,7 @@ Config::ResetDefaultConfig() {
return s;
}
s = SetResourceConfigIndexBuildDevice(CONFIG_RESOURCE_INDEX_BUILD_DEVICE_DEFAULT);
s = SetResourceConfigIndexBuildResources(CONFIG_RESOURCE_INDEX_BUILD_RESOURCES_DEFAULT);
if (!s.ok()) {
return s;
}
......@@ -599,22 +599,28 @@ Config::CheckCacheConfigGpuCacheCapacity(const std::string& value) {
return Status(SERVER_INVALID_ARGUMENT, msg);
} else {
uint64_t gpu_cache_capacity = std::stoi(value) * GB;
int device_id;
Status s = GetResourceConfigIndexBuildDevice(device_id);
std::vector<std::string> resources;
Status s = GetResourceConfigIndexBuildResources(resources);
if (!s.ok()) {
return s;
}
size_t gpu_memory;
if (!ValidationUtil::GetGpuMemory(device_id, gpu_memory).ok()) {
std::string msg = "Fail to get GPU memory for GPU device: " + std::to_string(device_id);
return Status(SERVER_UNEXPECTED_ERROR, msg);
} else if (gpu_cache_capacity >= gpu_memory) {
std::string msg = "Invalid gpu cache capacity: " + value +
". Possible reason: cache_config.gpu_cache_capacity exceeds GPU memory.";
return Status(SERVER_INVALID_ARGUMENT, msg);
} else if (gpu_cache_capacity > (double)gpu_memory * 0.9) {
std::cerr << "Warning: gpu cache capacity value is too big" << std::endl;
for (auto& resource : resources) {
if (resource == "cpu") {
continue;
}
int32_t device_id = std::stoi(resource.substr(3));
if (!ValidationUtil::GetGpuMemory(device_id, gpu_memory).ok()) {
std::string msg = "Fail to get GPU memory for GPU device: " + std::to_string(device_id);
return Status(SERVER_UNEXPECTED_ERROR, msg);
} else if (gpu_cache_capacity >= gpu_memory) {
std::string msg = "Invalid gpu cache capacity: " + value +
". Possible reason: cache_config.gpu_cache_capacity exceeds GPU memory.";
return Status(SERVER_INVALID_ARGUMENT, msg);
} else if (gpu_cache_capacity > (double) gpu_memory * 0.9) {
std::cerr << "Warning: gpu cache capacity value is too big" << std::endl;
}
}
}
return Status::OK();
......@@ -745,10 +751,18 @@ Config::CheckResourceConfigSearchResources(const std::vector<std::string>& value
}
Status
Config::CheckResourceConfigIndexBuildDevice(const std::string& value) {
auto status = CheckResource(value);
if (!status.ok()) {
return Status(SERVER_INVALID_ARGUMENT, status.message());
Config::CheckResourceConfigIndexBuildResources(const std::vector<std::string>& value) {
    // Validates the list of devices configured for index building
    // (resource_config.index_build_resources).
    //
    // Returns SERVER_INVALID_ARGUMENT when the list is empty or when
    // CheckResource() rejects any entry; returns Status::OK() otherwise.
    if (value.empty()) {
        // The message names the key exactly as it is spelled in the config
        // file (CONFIG_RESOURCE_INDEX_BUILD_RESOURCES == "index_build_resources")
        // so that users can locate the offending setting.
        std::string msg =
            "Invalid build index resource. "
            "Possible reason: resource_config.index_build_resources is empty.";
        return Status(SERVER_INVALID_ARGUMENT, msg);
    }

    // Stop at the first invalid entry and surface CheckResource()'s message,
    // re-wrapped with the SERVER_INVALID_ARGUMENT code.
    for (const auto& resource : value) {
        auto status = CheckResource(resource);
        if (!status.ok()) {
            return Status(SERVER_INVALID_ARGUMENT, status.message());
        }
    }
    return Status::OK();
}
......@@ -1030,27 +1044,18 @@ Status
Config::GetResourceConfigSearchResources(std::vector<std::string>& value) {
std::string str =
GetConfigSequenceStr(CONFIG_RESOURCE, CONFIG_RESOURCE_SEARCH_RESOURCES,
CONFIG_RESOURCE_SEARCH_RESOURCES_DELIMITER, CONFIG_RESOURCE_SEARCH_RESOURCES_DEFAULT);
server::StringHelpFunctions::SplitStringByDelimeter(str, CONFIG_RESOURCE_SEARCH_RESOURCES_DELIMITER, value);
CONFIG_RESOURCE_RESOURCES_DELIMITER, CONFIG_RESOURCE_SEARCH_RESOURCES_DEFAULT);
server::StringHelpFunctions::SplitStringByDelimeter(str, CONFIG_RESOURCE_RESOURCES_DELIMITER, value);
return CheckResourceConfigSearchResources(value);
}
Status
Config::GetResourceConfigIndexBuildDevice(int32_t& value) {
Config::GetResourceConfigIndexBuildResources(std::vector<std::string>& value) {
std::string str =
GetConfigStr(CONFIG_RESOURCE, CONFIG_RESOURCE_INDEX_BUILD_DEVICE, CONFIG_RESOURCE_INDEX_BUILD_DEVICE_DEFAULT);
Status s = CheckResourceConfigIndexBuildDevice(str);
if (!s.ok()) {
return s;
}
if (str == "cpu") {
value = CPU_DEVICE_ID;
} else {
value = std::stoi(str.substr(3));
}
return Status::OK();
GetConfigSequenceStr(CONFIG_RESOURCE, CONFIG_RESOURCE_INDEX_BUILD_RESOURCES,
CONFIG_RESOURCE_RESOURCES_DELIMITER, CONFIG_RESOURCE_INDEX_BUILD_RESOURCES_DEFAULT);
server::StringHelpFunctions::SplitStringByDelimeter(str, CONFIG_RESOURCE_RESOURCES_DELIMITER, value);
return CheckResourceConfigIndexBuildResources(value);
}
///////////////////////////////////////////////////////////////////////////////
......@@ -1305,7 +1310,7 @@ Config::SetResourceConfigMode(const std::string& value) {
Status
Config::SetResourceConfigSearchResources(const std::string& value) {
std::vector<std::string> res_vec;
server::StringHelpFunctions::SplitStringByDelimeter(value, CONFIG_RESOURCE_SEARCH_RESOURCES_DELIMITER, res_vec);
server::StringHelpFunctions::SplitStringByDelimeter(value, CONFIG_RESOURCE_RESOURCES_DELIMITER, res_vec);
Status s = CheckResourceConfigSearchResources(res_vec);
if (!s.ok()) {
......@@ -1317,13 +1322,16 @@ Config::SetResourceConfigSearchResources(const std::string& value) {
}
Status
Config::SetResourceConfigIndexBuildDevice(const std::string& value) {
Status s = CheckResourceConfigIndexBuildDevice(value);
Config::SetResourceConfigIndexBuildResources(const std::string &value) {
std::vector<std::string> res_vec;
server::StringHelpFunctions::SplitStringByDelimeter(value, CONFIG_RESOURCE_RESOURCES_DELIMITER, res_vec);
Status s = CheckResourceConfigIndexBuildResources(res_vec);
if (!s.ok()) {
return s;
}
SetConfigValueInMem(CONFIG_RESOURCE, CONFIG_RESOURCE_INDEX_BUILD_DEVICE, value);
SetConfigValueInMem(CONFIG_RESOURCE, CONFIG_RESOURCE_INDEX_BUILD_RESOURCES, value);
return Status::OK();
}
......
......@@ -91,20 +91,18 @@ static const char* CONFIG_ENGINE_GPU_SEARCH_THRESHOLD_DEFAULT = "1000";
static const char* CONFIG_RESOURCE = "resource_config";
static const char* CONFIG_RESOURCE_MODE = "mode";
static const char* CONFIG_RESOURCE_MODE_DEFAULT = "simple";
static const char* CONFIG_RESOURCE_RESOURCES_DELIMITER = ",";
static const char* CONFIG_RESOURCE_SEARCH_RESOURCES = "search_resources";
static const char* CONFIG_RESOURCE_SEARCH_RESOURCES_DELIMITER = ",";
#ifdef MILVUS_CPU_VERSION
static const char* CONFIG_RESOURCE_SEARCH_RESOURCES_DEFAULT = "cpu";
#else
static const char* CONFIG_RESOURCE_SEARCH_RESOURCES_DEFAULT = "cpu,gpu0";
#endif
static const char* CONFIG_RESOURCE_INDEX_BUILD_DEVICE = "index_build_device";
static const char* CONFIG_RESOURCE_INDEX_BUILD_RESOURCES = "index_build_resources";
#ifdef MILVUS_CPU_VERSION
static const char* CONFIG_RESOURCE_INDEX_BUILD_DEVICE_DEFAULT = "cpu";
static const char* CONFIG_RESOURCE_INDEX_BUILD_RESOURCES_DEFAULT = "cpu";
#else
static const char* CONFIG_RESOURCE_INDEX_BUILD_DEVICE_DEFAULT = "gpu0";
static const char* CONFIG_RESOURCE_INDEX_BUILD_RESOURCES_DEFAULT = "gpu0";
#endif
const int32_t CPU_DEVICE_ID = -1;
......@@ -190,7 +188,7 @@ class Config {
Status
CheckResourceConfigSearchResources(const std::vector<std::string>& value);
Status
CheckResourceConfigIndexBuildDevice(const std::string& value);
CheckResourceConfigIndexBuildResources(const std::vector<std::string>& value);
std::string
GetConfigStr(const std::string& parent_key, const std::string& child_key, const std::string& default_value = "");
......@@ -259,7 +257,7 @@ class Config {
Status
GetResourceConfigSearchResources(std::vector<std::string>& value);
Status
GetResourceConfigIndexBuildDevice(int32_t& value);
GetResourceConfigIndexBuildResources(std::vector<std::string>& value);
public:
/* server config */
......@@ -320,7 +318,7 @@ class Config {
Status
SetResourceConfigSearchResources(const std::string& value);
Status
SetResourceConfigIndexBuildDevice(const std::string& value);
SetResourceConfigIndexBuildResources(const std::string& value);
private:
std::unordered_map<std::string, std::unordered_map<std::string, std::string>> config_map_;
......
......@@ -182,7 +182,7 @@ ValidationUtil::ValidatePartitionTags(const std::vector<std::string>& partition_
}
Status
ValidationUtil::ValidateGpuIndex(uint32_t gpu_index) {
ValidationUtil::ValidateGpuIndex(int32_t gpu_index) {
#ifdef MILVUS_GPU_VERSION
int num_devices = 0;
auto cuda_err = cudaGetDeviceCount(&num_devices);
......@@ -203,7 +203,7 @@ ValidationUtil::ValidateGpuIndex(uint32_t gpu_index) {
}
Status
ValidationUtil::GetGpuMemory(uint32_t gpu_index, size_t& memory) {
ValidationUtil::GetGpuMemory(int32_t gpu_index, size_t& memory) {
#ifdef MILVUS_GPU_VERSION
cudaDeviceProp deviceProp;
......
......@@ -59,10 +59,10 @@ class ValidationUtil {
ValidatePartitionTags(const std::vector<std::string>& partition_tags);
static Status
ValidateGpuIndex(uint32_t gpu_index);
ValidateGpuIndex(int32_t gpu_index);
static Status
GetGpuMemory(uint32_t gpu_index, size_t& memory);
GetGpuMemory(int32_t gpu_index, size_t& memory);
static Status
ValidateIpAddress(const std::string& ip_address);
......
......@@ -272,29 +272,34 @@ TEST_F(ConfigTest, SERVER_CONFIG_VALID_TEST) {
#else
std::vector<std::string> search_resources = {"cpu", "gpu0"};
#endif
std::vector<std::string> res_vec;
std::string res_str;
std::vector<std::string> search_res_vec;
std::string search_res_str;
milvus::server::StringHelpFunctions::MergeStringWithDelimeter(
search_resources, milvus::server::CONFIG_RESOURCE_SEARCH_RESOURCES_DELIMITER, res_str);
s = config.SetResourceConfigSearchResources(res_str);
search_resources, milvus::server::CONFIG_RESOURCE_RESOURCES_DELIMITER, search_res_str);
s = config.SetResourceConfigSearchResources(search_res_str);
ASSERT_TRUE(s.ok());
s = config.GetResourceConfigSearchResources(res_vec);
s = config.GetResourceConfigSearchResources(search_res_vec);
ASSERT_TRUE(s.ok());
for (size_t i = 0; i < search_resources.size(); i++) {
ASSERT_TRUE(search_resources[i] == res_vec[i]);
ASSERT_TRUE(search_resources[i] == search_res_vec[i]);
}
#ifdef MILVUS_CPU_VERSION
int32_t resource_index_build_device = milvus::server::CPU_DEVICE_ID;
s = config.SetResourceConfigIndexBuildDevice("cpu");
std::vector<std::string> index_build_resources = {"cpu"};
#else
int32_t resource_index_build_device = 0;
s = config.SetResourceConfigIndexBuildDevice("gpu" + std::to_string(resource_index_build_device));
std::vector<std::string> index_build_resources = {"gpu0", "gpu1"};
#endif
std::vector<std::string> index_build_res_vec;
std::string index_build_res_str;
milvus::server::StringHelpFunctions::MergeStringWithDelimeter(
index_build_resources, milvus::server::CONFIG_RESOURCE_RESOURCES_DELIMITER, index_build_res_str);
s = config.SetResourceConfigIndexBuildResources(index_build_res_str);
ASSERT_TRUE(s.ok());
s = config.GetResourceConfigIndexBuildDevice(int32_val);
s = config.GetResourceConfigIndexBuildResources(index_build_res_vec);
ASSERT_TRUE(s.ok());
ASSERT_TRUE(int32_val == resource_index_build_device);
for (size_t i = 0; i < index_build_resources.size(); i++) {
ASSERT_TRUE(index_build_resources[i] == index_build_res_vec[i]);
}
}
TEST_F(ConfigTest, SERVER_CONFIG_INVALID_TEST) {
......@@ -418,9 +423,9 @@ TEST_F(ConfigTest, SERVER_CONFIG_INVALID_TEST) {
s = config.SetResourceConfigSearchResources("cpu");
ASSERT_TRUE(s.ok());
s = config.SetResourceConfigIndexBuildDevice("gup2");
s = config.SetResourceConfigIndexBuildResources("gup2");
ASSERT_FALSE(s.ok());
s = config.SetResourceConfigIndexBuildDevice("gpu16");
s = config.SetResourceConfigIndexBuildResources("gpu16");
ASSERT_FALSE(s.ok());
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册