Commit 39d63b33 authored by wxyu

MS-546 Add simple mode resource_config


Former-commit-id: 25f95d9c8c3adc108c3ec17954f33bbfa9e48d08
Parent 5d6bd172
......@@ -115,6 +115,7 @@ Please mark all change in change log and use the ticket from JIRA.
- MS-533 - Update resource_test to cover dump function
- MS-523 - Config file validation
- MS-539 - Remove old task code
- MS-546 - Add simple mode resource_config
## New Feature
- MS-343 - Implement ResourceMgr
......
......@@ -38,44 +38,8 @@ engine_config:
use_blas_threshold: 20
resource_config:
# resource list, length: 0~N
# please configure at least one DISK resource and one CPU resource, or the system will not return query results.
#
# example:
# resource_name: # resource name, used only in the connections below
# type: DISK # resource type, optional: DISK/CPU/GPU
# device_id: 0
# enable_executor: false # whether to enable the executor, optional: true, false
mode: simple
resources:
ssda:
type: DISK
device_id: 0
enable_executor: false
cpu:
type: CPU
device_id: 0
enable_executor: false
gpu0:
type: GPU
device_id: 0
enable_executor: true
gpu_resource_num: 2
pinned_memory: 300
temp_memory: 300
# connection list, length: 0~N
# example:
# connection_name:
# speed: 100 # unit: MB/s
# endpoint: ${resource_name}===${resource_name}
connections:
io:
speed: 500
endpoint: ssda===cpu
pcie0:
speed: 11000
endpoint: cpu===gpu0
- cpu
# - gpu0
# - gpu1
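
Taken together, the added lines in this hunk collapse the old per-resource map into a much smaller simple-mode section. A sketch of the resulting resource_config, reconstructed from the added lines above (the commented gpu entries are optional and only valid on machines that actually have those devices):

    resource_config:
      # resource list, length: 0~N
      mode: simple
      resources:
        - cpu
        # - gpu0
        # - gpu1
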
......@@ -8,6 +8,8 @@
#include "server/ServerConfig.h"
#include "ResourceFactory.h"
#include "knowhere/index/vector_index/gpu_ivf.h"
#include "Utils.h"
namespace zilliz {
namespace milvus {
......@@ -20,72 +22,132 @@ SchedulerPtr SchedInst::instance = nullptr;
std::mutex SchedInst::mutex_;
void
StartSchedulerService() {
try {
server::ConfigNode &config = server::ServerConfig::GetInstance().GetConfig(server::CONFIG_RESOURCE);
if (config.GetChildren().empty()) throw "resource_config null exception";
auto resources = config.GetChild(server::CONFIG_RESOURCES).GetChildren();
if (resources.empty()) throw "Children of resource_config null exception";
for (auto &resource : resources) {
auto &resname = resource.first;
auto &resconf = resource.second;
auto type = resconf.GetValue(server::CONFIG_RESOURCE_TYPE);
// auto memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_MEMORY);
auto device_id = resconf.GetInt64Value(server::CONFIG_RESOURCE_DEVICE_ID);
// auto enable_loader = resconf.GetBoolValue(server::CONFIG_RESOURCE_ENABLE_LOADER);
auto enable_loader = true;
auto enable_executor = resconf.GetBoolValue(server::CONFIG_RESOURCE_ENABLE_EXECUTOR);
auto pinned_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_PIN_MEMORY);
auto temp_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_TEMP_MEMORY);
auto resource_num = resconf.GetInt64Value(server::CONFIG_RESOURCE_NUM);
auto res = ResMgrInst::GetInstance()->Add(ResourceFactory::Create(resname,
type,
device_id,
enable_loader,
enable_executor));
if (res.lock()->type() == ResourceType::GPU) {
auto pinned_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_PIN_MEMORY, 300);
auto temp_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_TEMP_MEMORY, 300);
auto resource_num = resconf.GetInt64Value(server::CONFIG_RESOURCE_NUM, 2);
pinned_memory = 1024 * 1024 * pinned_memory;
temp_memory = 1024 * 1024 * temp_memory;
knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(device_id,
pinned_memory,
temp_memory,
resource_num);
load_simple_config() {
server::ConfigNode &config = server::ServerConfig::GetInstance().GetConfig(server::CONFIG_RESOURCE);
auto mode = config.GetValue("mode", "simple");
auto resources = config.GetSequence("resources");
bool cpu = false;
std::set<uint64_t> gpu_ids;
for (auto &resource : resources) {
if (resource == "cpu") {
cpu = true;
break;
} else {
if (resource.length() < 4 || resource.substr(0, 3) != "gpu") {
// error
exit(-1);
}
auto gpu_id = std::stoi(resource.substr(3));
if (gpu_id >= get_num_gpu()) {
// error
exit(-1);
}
gpu_ids.insert(gpu_id);
}
}
knowhere::FaissGpuResourceMgr::GetInstance().InitResource();
ResMgrInst::GetInstance()->Add(ResourceFactory::Create("disk", "DISK", 0, true, false));
auto io = Connection("io", 500);
if (cpu) {
ResMgrInst::GetInstance()->Add(ResourceFactory::Create("cpu", "CPU", 0, true, true));
ResMgrInst::GetInstance()->Connect("disk", "cpu", io);
} else {
ResMgrInst::GetInstance()->Add(ResourceFactory::Create("cpu", "CPU", 0, true, false));
ResMgrInst::GetInstance()->Connect("disk", "cpu", io);
auto pcie = Connection("pcie", 12000);
for (auto &gpu_id : gpu_ids) {
ResMgrInst::GetInstance()->Add(ResourceFactory::Create(std::to_string(gpu_id), "GPU", gpu_id, true, true));
ResMgrInst::GetInstance()->Connect("cpu", std::to_string(gpu_id), pcie);
auto pinned_memory = 300;
auto temp_memory = 300;
auto resource_num = 2;
pinned_memory = 1024 * 1024 * pinned_memory;
temp_memory = 1024 * 1024 * temp_memory;
knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(gpu_id,
pinned_memory,
temp_memory,
resource_num);
}
auto connections = config.GetChild(server::CONFIG_RESOURCE_CONNECTIONS).GetChildren();
if(connections.empty()) throw "connections config null exception";
for (auto &conn : connections) {
auto &connect_name = conn.first;
auto &connect_conf = conn.second;
auto connect_speed = connect_conf.GetInt64Value(server::CONFIG_SPEED_CONNECTIONS);
auto connect_endpoint = connect_conf.GetValue(server::CONFIG_ENDPOINT_CONNECTIONS);
knowhere::FaissGpuResourceMgr::GetInstance().InitResource();
}
}
std::string delimiter = "===";
std::string left = connect_endpoint.substr(0, connect_endpoint.find(delimiter));
std::string right = connect_endpoint.substr(connect_endpoint.find(delimiter) + 3,
connect_endpoint.length());
void
load_advance_config() {
// try {
// server::ConfigNode &config = server::ServerConfig::GetInstance().GetConfig(server::CONFIG_RESOURCE);
//
// if (config.GetChildren().empty()) throw "resource_config null exception";
//
// auto resources = config.GetChild(server::CONFIG_RESOURCES).GetChildren();
//
// if (resources.empty()) throw "Children of resource_config null exception";
//
// for (auto &resource : resources) {
// auto &resname = resource.first;
// auto &resconf = resource.second;
// auto type = resconf.GetValue(server::CONFIG_RESOURCE_TYPE);
//// auto memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_MEMORY);
// auto device_id = resconf.GetInt64Value(server::CONFIG_RESOURCE_DEVICE_ID);
//// auto enable_loader = resconf.GetBoolValue(server::CONFIG_RESOURCE_ENABLE_LOADER);
// auto enable_loader = true;
// auto enable_executor = resconf.GetBoolValue(server::CONFIG_RESOURCE_ENABLE_EXECUTOR);
// auto pinned_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_PIN_MEMORY);
// auto temp_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_TEMP_MEMORY);
// auto resource_num = resconf.GetInt64Value(server::CONFIG_RESOURCE_NUM);
//
// auto res = ResMgrInst::GetInstance()->Add(ResourceFactory::Create(resname,
// type,
// device_id,
// enable_loader,
// enable_executor));
//
// if (res.lock()->type() == ResourceType::GPU) {
// auto pinned_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_PIN_MEMORY, 300);
// auto temp_memory = resconf.GetInt64Value(server::CONFIG_RESOURCE_TEMP_MEMORY, 300);
// auto resource_num = resconf.GetInt64Value(server::CONFIG_RESOURCE_NUM, 2);
// pinned_memory = 1024 * 1024 * pinned_memory;
// temp_memory = 1024 * 1024 * temp_memory;
// knowhere::FaissGpuResourceMgr::GetInstance().InitDevice(device_id,
// pinned_memory,
// temp_memory,
// resource_num);
// }
// }
//
// knowhere::FaissGpuResourceMgr::GetInstance().InitResource();
//
// auto connections = config.GetChild(server::CONFIG_RESOURCE_CONNECTIONS).GetChildren();
// if (connections.empty()) throw "connections config null exception";
// for (auto &conn : connections) {
// auto &connect_name = conn.first;
// auto &connect_conf = conn.second;
// auto connect_speed = connect_conf.GetInt64Value(server::CONFIG_SPEED_CONNECTIONS);
// auto connect_endpoint = connect_conf.GetValue(server::CONFIG_ENDPOINT_CONNECTIONS);
//
// std::string delimiter = "===";
// std::string left = connect_endpoint.substr(0, connect_endpoint.find(delimiter));
// std::string right = connect_endpoint.substr(connect_endpoint.find(delimiter) + 3,
// connect_endpoint.length());
//
// auto connection = Connection(connect_name, connect_speed);
// ResMgrInst::GetInstance()->Connect(left, right, connection);
// }
// } catch (const char *msg) {
// SERVER_LOG_ERROR << msg;
// // TODO: throw exception instead
// exit(-1);
//// throw std::exception();
// }
}
auto connection = Connection(connect_name, connect_speed);
ResMgrInst::GetInstance()->Connect(left, right, connection);
}
} catch (const char* msg) {
SERVER_LOG_ERROR << msg;
// TODO: throw exception instead
exit(-1);
// throw std::exception();
}
void
StartSchedulerService() {
load_simple_config();
// load_advance_config();
ResMgrInst::GetInstance()->Start();
SchedInst::GetInstance()->Start();
......
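
For reference, with the default config shown earlier (a resources list containing only "- cpu"), load_simple_config registers a DISK resource and an executor-enabled CPU resource and joins them with the "io" connection (speed 500). GPU resources are built only when the list names gpuN entries and contains no "cpu" entry; in that case each listed device is registered, connected to the CPU resource, and given a FAISS GPU pool initialized with 300 MB pinned memory, 300 MB temporary memory, and resource_num = 2. A hedged example of such a GPU-mode list (the device ids are placeholders and must be smaller than the count reported by get_num_gpu):

    resource_config:
      mode: simple
      resources:
        - gpu0
        # - gpu1    # add further devices only if they exist on this machine
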
......@@ -7,6 +7,7 @@
#include "Utils.h"
#include <chrono>
#include <cuda_runtime.h>
namespace zilliz {
......@@ -21,6 +22,13 @@ get_current_timestamp() {
return millis;
}
uint64_t
get_num_gpu() {
int n_devices = 0;
cudaGetDeviceCount(&n_devices);
return n_devices;
}
}
}
}
\ No newline at end of file
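
get_num_gpu above ignores the cudaError_t returned by cudaGetDeviceCount; because n_devices starts at 0, a CUDA failure quietly reports zero devices and the error itself is never surfaced. A defensive variant, offered only as a sketch and not part of this commit (the name get_num_gpu_checked is hypothetical):

    // Sketch only: same query as get_num_gpu, but the CUDA error code is
    // checked explicitly and any failure is treated as "no usable GPU".
    uint64_t
    get_num_gpu_checked() {
        int n_devices = 0;
        cudaError_t err = cudaGetDeviceCount(&n_devices);
        if (err != cudaSuccess) {
            return 0;
        }
        return static_cast<uint64_t>(n_devices);
    }
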
......@@ -14,6 +14,9 @@ namespace engine {
uint64_t
get_current_timestamp();
uint64_t
get_num_gpu();
}
}
}
\ No newline at end of file
......@@ -378,6 +378,32 @@ ServerConfig::CheckEngineConfig() {
ErrorCode
ServerConfig::CheckResourceConfig() {
/*
resource_config:
mode: simple
resources:
- cpu
- gpu0
- gpu100
*/
bool okay = true;
server::ConfigNode &config = server::ServerConfig::GetInstance().GetConfig(server::CONFIG_RESOURCE);
auto mode = config.GetValue("mode", "simple");
if (mode != "simple") {
std::cerr << "ERROR: invalid resource config: mode is " << mode << std::endl;
okay = false;
}
auto resources = config.GetSequence("resources");
if (resources.empty()) {
std::cerr << "ERROR: invalid resource config: resources empty" << std::endl;
okay = false;
}
return (okay ? SERVER_SUCCESS : SERVER_INVALID_ARGUMENT);
}
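
Note that this simple-mode check only verifies that mode is "simple" and that the resources sequence is non-empty; individual entries are not validated, so a name such as gpu100 from the comment block above would pass here and only be rejected later by load_simple_config via exit(-1). A hypothetical helper that applies the same naming rules at validation time, offered purely as a sketch (the function name and the std::vector<std::string> parameter type are assumptions, not part of this commit):

    // Hypothetical sketch: accept only "cpu" or "gpuN" with N below the detected
    // device count, mirroring the rules load_simple_config enforces at startup.
    // Assumes <string>, <vector>, <cstdint>, <exception> and the existing
    // engine::get_num_gpu() helper.
    static bool
    ValidateSimpleResources(const std::vector<std::string> &resources, uint64_t num_gpu) {
        for (auto &name : resources) {
            if (name == "cpu") {
                continue;
            }
            if (name.length() < 4 || name.substr(0, 3) != "gpu") {
                return false;                     // neither "cpu" nor "gpuN"
            }
            try {
                int gpu_id = std::stoi(name.substr(3));
                if (gpu_id < 0 || static_cast<uint64_t>(gpu_id) >= num_gpu) {
                    return false;                 // device id out of range
                }
            } catch (const std::exception &) {
                return false;                     // non-numeric suffix, e.g. "gpu_x"
            }
        }
        return true;
    }
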
//ErrorCode
//ServerConfig::CheckResourceConfig() {
/*
resource_config:
......@@ -422,142 +448,143 @@ ServerConfig::CheckResourceConfig() {
speed: 11000
endpoint: cpu===gpu0
*/
bool okay = true;
server::ConfigNode resource_config = GetConfig(CONFIG_RESOURCE);
if (resource_config.GetChildren().empty()) {
std::cerr << "ERROR: no context under resource" << std::endl;
okay = false;
}
auto resources = resource_config.GetChild(CONFIG_RESOURCES).GetChildren();
if (resources.empty()) {
std::cerr << "no resources specified" << std::endl;
okay = false;
}
bool resource_valid_flag = false;
bool hasDisk = false;
bool hasCPU = false;
bool hasExecutor = false;
std::set<std::string> resource_list;
for (auto &resource : resources) {
resource_list.emplace(resource.first);
auto &resource_conf = resource.second;
auto type = resource_conf.GetValue(CONFIG_RESOURCE_TYPE);
std::string device_id_str = resource_conf.GetValue(CONFIG_RESOURCE_DEVICE_ID, "0");
int32_t device_id = -1;
if (ValidationUtil::ValidateStringIsNumber(device_id_str) != SERVER_SUCCESS) {
std::cerr << "ERROR: device_id " << device_id_str << " is not a number" << std::endl;
okay = false;
} else {
device_id = std::stol(device_id_str);
}
std::string enable_executor_str = resource_conf.GetValue(CONFIG_RESOURCE_ENABLE_EXECUTOR, "off");
if (ValidationUtil::ValidateStringIsBool(enable_executor_str) != SERVER_SUCCESS) {
std::cerr << "ERROR: invalid enable_executor config: " << enable_executor_str << std::endl;
okay = false;
}
if (type == "DISK") {
hasDisk = true;
} else if (type == "CPU") {
hasCPU = true;
if (resource_conf.GetBoolValue(CONFIG_RESOURCE_ENABLE_EXECUTOR, false)) {
hasExecutor = true;
}
}
else if (type == "GPU") {
int build_index_gpu_index = GetConfig(CONFIG_DB).GetInt32Value(CONFIG_DB_BUILD_INDEX_GPU, 0);
if (device_id == build_index_gpu_index) {
resource_valid_flag = true;
}
if (resource_conf.GetBoolValue(CONFIG_RESOURCE_ENABLE_EXECUTOR, false)) {
hasExecutor = true;
}
std::string gpu_resource_num_str = resource_conf.GetValue(CONFIG_RESOURCE_NUM, "2");
if (ValidationUtil::ValidateStringIsNumber(gpu_resource_num_str) != SERVER_SUCCESS) {
std::cerr << "ERROR: gpu_resource_num " << gpu_resource_num_str << " is not a number" << std::endl;
okay = false;
}
bool mem_valid = true;
std::string pinned_memory_str = resource_conf.GetValue(CONFIG_RESOURCE_PIN_MEMORY, "300");
if (ValidationUtil::ValidateStringIsNumber(pinned_memory_str) != SERVER_SUCCESS) {
std::cerr << "ERROR: pinned_memory " << pinned_memory_str << " is not a number" << std::endl;
okay = false;
mem_valid = false;
}
std::string temp_memory_str = resource_conf.GetValue(CONFIG_RESOURCE_TEMP_MEMORY, "300");
if (ValidationUtil::ValidateStringIsNumber(temp_memory_str) != SERVER_SUCCESS) {
std::cerr << "ERROR: temp_memory " << temp_memory_str << " is not a number" << std::endl;
okay = false;
mem_valid = false;
}
if (mem_valid) {
size_t gpu_memory;
if (ValidationUtil::GetGpuMemory(device_id, gpu_memory) != SERVER_SUCCESS) {
std::cerr << "ERROR: could not get gpu memory for device " << device_id << std::endl;
okay = false;
}
else {
size_t prealoc_mem = std::stol(pinned_memory_str) + std::stol(temp_memory_str);
if (prealoc_mem >= gpu_memory) {
std::cerr << "ERROR: sum of pinned_memory and temp_memory " << prealoc_mem
<< " exceeds total gpu memory " << gpu_memory << " for device " << device_id << std::endl;
okay = false;
}
}
}
}
}
if (!resource_valid_flag) {
std::cerr << "Building index GPU can't be found in resource config." << std::endl;
okay = false;
}
if (!hasDisk || !hasCPU) {
std::cerr << "No DISK or CPU resource" << std::endl;
okay = false;
}
if (!hasExecutor) {
std::cerr << "No CPU or GPU resource has executor enabled" << std::endl;
okay = false;
}
auto connections = resource_config.GetChild(CONFIG_RESOURCE_CONNECTIONS).GetChildren();
for (auto &connection : connections) {
auto &connection_conf = connection.second;
std::string speed_str = connection_conf.GetValue(CONFIG_SPEED_CONNECTIONS);
if (ValidationUtil::ValidateStringIsNumber(speed_str) != SERVER_SUCCESS) {
std::cerr << "ERROR: speed " << speed_str << " is not a number" << std::endl;
okay = false;
}
std::string endpoint_str = connection_conf.GetValue(CONFIG_ENDPOINT_CONNECTIONS);
std::string delimiter = "===";
auto delimiter_pos = endpoint_str.find(delimiter);
if (delimiter_pos == std::string::npos) {
std::cerr << "ERROR: invalid endpoint format: " << endpoint_str << std::endl;
okay = false;
} else {
std::string left_resource = endpoint_str.substr(0, delimiter_pos);
if (resource_list.find(left_resource) == resource_list.end()) {
std::cerr << "ERROR: left resource " << left_resource << " does not exist" << std::endl;
okay = false;
}
std::string right_resource = endpoint_str.substr(delimiter_pos + delimiter.length(), endpoint_str.length());
if (resource_list.find(right_resource) == resource_list.end()) {
std::cerr << "ERROR: right resource " << right_resource << " does not exist" << std::endl;
okay = false;
}
}
}
return (okay ? SERVER_SUCCESS : SERVER_INVALID_ARGUMENT);
}
// bool okay = true;
// server::ConfigNode resource_config = GetConfig(CONFIG_RESOURCE);
// if (resource_config.GetChildren().empty()) {
// std::cerr << "ERROR: no context under resource" << std::endl;
// okay = false;
// }
//
// auto resources = resource_config.GetChild(CONFIG_RESOURCES).GetChildren();
//
// if (resources.empty()) {
// std::cerr << "no resources specified" << std::endl;
// okay = false;
// }
//
// bool resource_valid_flag = false;
// bool hasDisk = false;
// bool hasCPU = false;
// bool hasExecutor = false;
// std::set<std::string> resource_list;
// for (auto &resource : resources) {
// resource_list.emplace(resource.first);
// auto &resource_conf = resource.second;
// auto type = resource_conf.GetValue(CONFIG_RESOURCE_TYPE);
//
// std::string device_id_str = resource_conf.GetValue(CONFIG_RESOURCE_DEVICE_ID, "0");
// int32_t device_id = -1;
// if (ValidationUtil::ValidateStringIsNumber(device_id_str) != SERVER_SUCCESS) {
// std::cerr << "ERROR: device_id " << device_id_str << " is not a number" << std::endl;
// okay = false;
// } else {
// device_id = std::stol(device_id_str);
// }
//
// std::string enable_executor_str = resource_conf.GetValue(CONFIG_RESOURCE_ENABLE_EXECUTOR, "off");
// if (ValidationUtil::ValidateStringIsBool(enable_executor_str) != SERVER_SUCCESS) {
// std::cerr << "ERROR: invalid enable_executor config: " << enable_executor_str << std::endl;
// okay = false;
// }
//
// if (type == "DISK") {
// hasDisk = true;
// } else if (type == "CPU") {
// hasCPU = true;
// if (resource_conf.GetBoolValue(CONFIG_RESOURCE_ENABLE_EXECUTOR, false)) {
// hasExecutor = true;
// }
// }
// else if (type == "GPU") {
// int build_index_gpu_index = GetConfig(CONFIG_DB).GetInt32Value(CONFIG_DB_BUILD_INDEX_GPU, 0);
// if (device_id == build_index_gpu_index) {
// resource_valid_flag = true;
// }
// if (resource_conf.GetBoolValue(CONFIG_RESOURCE_ENABLE_EXECUTOR, false)) {
// hasExecutor = true;
// }
// std::string gpu_resource_num_str = resource_conf.GetValue(CONFIG_RESOURCE_NUM, "2");
// if (ValidationUtil::ValidateStringIsNumber(gpu_resource_num_str) != SERVER_SUCCESS) {
// std::cerr << "ERROR: gpu_resource_num " << gpu_resource_num_str << " is not a number" << std::endl;
// okay = false;
// }
// bool mem_valid = true;
// std::string pinned_memory_str = resource_conf.GetValue(CONFIG_RESOURCE_PIN_MEMORY, "300");
// if (ValidationUtil::ValidateStringIsNumber(pinned_memory_str) != SERVER_SUCCESS) {
// std::cerr << "ERROR: pinned_memory " << pinned_memory_str << " is not a number" << std::endl;
// okay = false;
// mem_valid = false;
// }
// std::string temp_memory_str = resource_conf.GetValue(CONFIG_RESOURCE_TEMP_MEMORY, "300");
// if (ValidationUtil::ValidateStringIsNumber(temp_memory_str) != SERVER_SUCCESS) {
// std::cerr << "ERROR: temp_memory " << temp_memory_str << " is not a number" << std::endl;
// okay = false;
// mem_valid = false;
// }
// if (mem_valid) {
// size_t gpu_memory;
// if (ValidationUtil::GetGpuMemory(device_id, gpu_memory) != SERVER_SUCCESS) {
// std::cerr << "ERROR: could not get gpu memory for device " << device_id << std::endl;
// okay = false;
// }
// else {
// size_t prealoc_mem = std::stol(pinned_memory_str) + std::stol(temp_memory_str);
// if (prealoc_mem >= gpu_memory) {
// std::cerr << "ERROR: sum of pinned_memory and temp_memory " << prealoc_mem
// << " exceeds total gpu memory " << gpu_memory << " for device " << device_id << std::endl;
// okay = false;
// }
// }
// }
// }
// }
//
// if (!resource_valid_flag) {
// std::cerr << "Building index GPU can't be found in resource config." << std::endl;
// okay = false;
// }
// if (!hasDisk || !hasCPU) {
// std::cerr << "No DISK or CPU resource" << std::endl;
// okay = false;
// }
// if (!hasExecutor) {
// std::cerr << "No CPU or GPU resource has executor enabled" << std::endl;
// okay = false;
// }
//
// auto connections = resource_config.GetChild(CONFIG_RESOURCE_CONNECTIONS).GetChildren();
// for (auto &connection : connections) {
// auto &connection_conf = connection.second;
//
// std::string speed_str = connection_conf.GetValue(CONFIG_SPEED_CONNECTIONS);
// if (ValidationUtil::ValidateStringIsNumber(speed_str) != SERVER_SUCCESS) {
// std::cerr << "ERROR: speed " << speed_str << " is not a number" << std::endl;
// okay = false;
// }
//
// std::string endpoint_str = connection_conf.GetValue(CONFIG_ENDPOINT_CONNECTIONS);
// std::string delimiter = "===";
// auto delimiter_pos = endpoint_str.find(delimiter);
// if (delimiter_pos == std::string::npos) {
// std::cerr << "ERROR: invalid endpoint format: " << endpoint_str << std::endl;
// okay = false;
// } else {
// std::string left_resource = endpoint_str.substr(0, delimiter_pos);
// if (resource_list.find(left_resource) == resource_list.end()) {
// std::cerr << "ERROR: left resource " << left_resource << " does not exist" << std::endl;
// okay = false;
// }
// std::string right_resource = endpoint_str.substr(delimiter_pos + delimiter.length(), endpoint_str.length());
// if (resource_list.find(right_resource) == resource_list.end()) {
// std::cerr << "ERROR: right resource " << right_resource << " does not exist" << std::endl;
// okay = false;
// }
// }
// }
//
// return (okay ? SERVER_SUCCESS : SERVER_INVALID_ARGUMENT);
// return SERVER_SUCCESS;
//}
void
ServerConfig::PrintAll() const {
......