提交 bf52f6ec 编写于 作者: W wxyu

Merge remote-tracking branch 'main/branch-0.4.0' into branch-0.4.0


Former-commit-id: 65dff7b2ae36eb956766bad30010360b9212a02e
......@@ -64,6 +64,13 @@ class FaissGpuResourceMgr {
// Returns `res` to the idle pool of `device_id`: while holding the manager's
// mutex, it is removed from the in-use pool and inserted at the front of the
// idle list for that device.
void
MoveToIdle(const int64_t &device_id, const ResPtr& res);
// Prints the in-use pool and then the idle pool to std::cout, one line per
// device id followed by one line per resource id.
void
Dump();
protected:
// Erases from `resource_pool[device_id]` every entry whose `id` equals
// `res->id`. Does nothing when `device_id` has no entry in the pool.
// Takes no lock itself; MoveToInuse/MoveToIdle call it while holding mutex_.
void
RemoveResource(const int64_t& device_id, const ResPtr& res, std::map<int64_t, std::vector<ResPtr>>& resource_pool);
protected:
// Set to true by InitResource(), which returns immediately when it is already
// true; reset to false at the end of Free().
bool is_init = false;
......
......@@ -19,6 +19,7 @@
#include "knowhere/adapter/faiss_adopt.h"
#include "knowhere/index/vector_index/gpu_ivf.h"
#include <algorithm>
namespace zilliz {
namespace knowhere {
......@@ -33,7 +34,7 @@ IndexModelPtr GPUIVF::Train(const DatasetPtr &dataset, const Config &config) {
auto temp_resource = FaissGpuResourceMgr::GetInstance().GetRes(gpu_device);
if (temp_resource != nullptr) {
ResScope rs(gpu_device, temp_resource );
ResScope rs(gpu_device, temp_resource);
faiss::gpu::GpuIndexIVFFlatConfig idx_config;
idx_config.device = gpu_device;
faiss::gpu::GpuIndexIVFFlat device_index(temp_resource->faiss_res.get(), dim, nlist, metric_type, idx_config);
......@@ -130,9 +131,9 @@ void GPUIVF::search_impl(int64_t n,
int64_t *labels,
const Config &cfg) {
// TODO(linxj): allocate mem
if (FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_, res_)) {
ResScope rs(gpu_id_, res_);
auto temp_res = FaissGpuResourceMgr::GetInstance().GetRes(gpu_id_);
if (temp_res) {
ResScope rs(gpu_id_, temp_res);
if (auto device_index = std::static_pointer_cast<faiss::gpu::GpuIndexIVF>(index_)) {
auto nprobe = cfg.get_with_default("nprobe", size_t(1));
......@@ -143,7 +144,6 @@ void GPUIVF::search_impl(int64_t n,
} else {
KNOWHERE_THROW_MSG("search can't get gpu resource");
}
}
VectorIndexPtr GPUIVF::CopyGpuToCpu(const Config &config) {
......@@ -229,8 +229,7 @@ IndexModelPtr GPUIVFSQ::Train(const DatasetPtr &dataset, const Config &config) {
delete build_index;
return std::make_shared<IVFIndexModel>(host_index);
}
else {
} else {
KNOWHERE_THROW_MSG("Build IVFSQ can't get gpu resource");
}
}
......@@ -279,6 +278,10 @@ void FaissGpuResourceMgr::InitDevice(int64_t device_id,
}
void FaissGpuResourceMgr::InitResource() {
if(is_init) return ;
is_init = true;
for(auto& device : devices_params_) {
auto& resource_vec = idle_[device.first];
......@@ -342,15 +345,29 @@ bool FaissGpuResourceMgr::GetRes(const int64_t &device_id,
// Marks `res` as busy on `device_id`: with mutex_ held, drops it from the idle
// pool and appends it to the in-use list for that device.
void FaissGpuResourceMgr::MoveToInuse(const int64_t &device_id, const ResPtr &res) {
    std::lock_guard<std::mutex> guard(mutex_);

    // Take it out of the idle pool first so it is never listed in both pools.
    RemoveResource(device_id, res, idle_);

    auto &busy_list = in_use_[device_id];
    busy_list.push_back(res);
}
// Hands `res` back on `device_id`: with mutex_ held, drops it from the in-use
// pool and places it at the head of the idle list for that device.
void FaissGpuResourceMgr::MoveToIdle(const int64_t &device_id, const ResPtr &res) {
    std::lock_guard<std::mutex> guard(mutex_);

    RemoveResource(device_id, res, in_use_);

    // Front insertion: the most recently released resource becomes the first
    // entry of the idle list.
    auto &idle_list = idle_[device_id];
    idle_list.insert(idle_list.begin(), res);
}
void
FaissGpuResourceMgr::RemoveResource(const int64_t &device_id,
const ResPtr &res,
std::map<int64_t, std::vector<ResPtr>> &resource_pool) {
if (resource_pool.find(device_id) != resource_pool.end()) {
std::vector<ResPtr> &res_array = resource_pool[device_id];
res_array.erase(std::remove_if(res_array.begin(), res_array.end(),
[&](ResPtr &ptr) { return ptr->id == res->id; }),
res_array.end());
}
}
void FaissGpuResourceMgr::Free() {
for (auto &item : in_use_) {
auto& res_vec = item.second;
......@@ -363,6 +380,25 @@ void FaissGpuResourceMgr::Free() {
is_init = false;
}
void
FaissGpuResourceMgr::Dump() {
std::cout << "In used resource" << std::endl;
for(auto& item: in_use_) {
std::cout << "device_id: " << item.first << std::endl;
for(auto& elem : item.second) {
std::cout << "resource_id: " << elem->id << std::endl;
}
}
std::cout << "Idle resource" << std::endl;
for(auto& item: idle_) {
std::cout << "device_id: " << item.first << std::endl;
for(auto& elem : item.second) {
std::cout << "resource_id: " << elem->id << std::endl;
}
}
}
void GPUIndex::SetGpuDevice(const int &gpu_id) {
gpu_id_ = gpu_id;
}
......
......@@ -61,9 +61,9 @@ ServerConfig::LoadConfigFile(const std::string& config_filename) {
ErrorCode ServerConfig::ValidateConfig() const {
//server config validation
ConfigNode server_config = GetConfig(CONFIG_SERVER);
uint32_t gpu_index = (uint32_t)server_config.GetInt32Value(CONFIG_GPU_INDEX, 0);
if(ValidationUtil::ValidateGpuIndex(gpu_index) != SERVER_SUCCESS) {
std::cout << "Error: invalid gpu_index " << std::to_string(gpu_index) << std::endl;
uint32_t build_index_gpu_index = (uint32_t)server_config.GetInt32Value(CONFIG_GPU_INDEX, 0);
if(ValidationUtil::ValidateGpuIndex(build_index_gpu_index) != SERVER_SUCCESS) {
std::cerr << "Error: invalid gpu_index " << std::to_string(build_index_gpu_index) << std::endl;
return SERVER_INVALID_ARGUMENT;
}
......@@ -75,7 +75,7 @@ ErrorCode ServerConfig::ValidateConfig() const {
uint64_t insert_buffer_size = (uint64_t)db_config.GetInt32Value(CONFIG_DB_INSERT_BUFFER_SIZE, 4);
insert_buffer_size *= GB;
if(insert_buffer_size >= total_mem) {
std::cout << "Error: insert_buffer_size execeed system memory" << std::endl;
std::cerr << "Error: insert_buffer_size execeed system memory" << std::endl;
return SERVER_INVALID_ARGUMENT;
}
......@@ -84,20 +84,51 @@ ErrorCode ServerConfig::ValidateConfig() const {
uint64_t cache_cap = (uint64_t)cache_config.GetInt64Value(CONFIG_CPU_CACHE_CAPACITY, 16);
cache_cap *= GB;
if(cache_cap >= total_mem) {
std::cout << "Error: cpu_cache_capacity execeed system memory" << std::endl;
std::cerr << "Error: cpu_cache_capacity execeed system memory" << std::endl;
return SERVER_INVALID_ARGUMENT;
} if(cache_cap > (double)total_mem*0.9) {
std::cout << "Warnning: cpu_cache_capacity value is too aggressive" << std::endl;
std::cerr << "Warning: cpu_cache_capacity value is too aggressive" << std::endl;
}
if(insert_buffer_size + cache_cap >= total_mem) {
std::cout << "Error: sum of cpu_cache_capacity and insert_buffer_size execeed system memory" << std::endl;
std::cerr << "Error: sum of cpu_cache_capacity and insert_buffer_size execeed system memory" << std::endl;
return SERVER_INVALID_ARGUMENT;
}
double free_percent = cache_config.GetDoubleValue(server::CACHE_FREE_PERCENT, 0.85);
if(free_percent < std::numeric_limits<double>::epsilon() || free_percent > 1.0) {
std::cout << "Error: invalid cache_free_percent " << std::to_string(free_percent) << std::endl;
std::cerr << "Error: invalid cache_free_percent " << std::to_string(free_percent) << std::endl;
return SERVER_INVALID_ARGUMENT;
}
// Resource config validation
server::ConfigNode &config = server::ServerConfig::GetInstance().GetConfig(server::CONFIG_RESOURCE);
if (config.GetChildren().empty()) {
std::cerr << "Error: no context under resource" << std::endl;
return SERVER_INVALID_ARGUMENT;
}
auto resources = config.GetChild(server::CONFIG_RESOURCES).GetChildren();
if (resources.empty()) {
std::cerr << "Children of resource_config null exception" << std::endl;
return SERVER_INVALID_ARGUMENT;
}
bool resource_valid_flag = false;
for (auto &resource : resources) {
auto &resconf = resource.second;
auto type = resconf.GetValue(server::CONFIG_RESOURCE_TYPE);
if(type == "GPU") {
auto device_id = resconf.GetInt64Value(server::CONFIG_RESOURCE_DEVICE_ID, 0);
if(device_id == build_index_gpu_index) {
resource_valid_flag = true;
}
}
}
if(!resource_valid_flag) {
std::cerr << "Building index GPU can't be found in resource config." << std::endl;
return SERVER_INVALID_ARGUMENT;
}
......
......@@ -78,6 +78,7 @@ constexpr ErrorCode DB_INVALID_PATH = ToDbErrorCode(5);
constexpr ErrorCode KNOWHERE_ERROR = ToKnowhereErrorCode(1);
constexpr ErrorCode KNOWHERE_INVALID_ARGUMENT = ToKnowhereErrorCode(2);
constexpr ErrorCode KNOWHERE_UNEXPECTED_ERROR = ToKnowhereErrorCode(3);
constexpr ErrorCode KNOWHERE_NO_SPACE = ToKnowhereErrorCode(4);
namespace server {
class ServerException : public std::exception {
......
......@@ -38,6 +38,9 @@ public:
static ErrorCode
GetGpuMemory(uint32_t gpu_index, size_t &memory);
static ErrorCode
ValidateConfig();
};
}
......
......@@ -140,7 +140,7 @@ VecIndexPtr read_index(const std::string &location) {
FileIOReader reader(location);
reader.fs.seekg(0, reader.fs.end);
int64_t length = reader.fs.tellg();
if(length <= 0) {
if (length <= 0) {
return nullptr;
}
......@@ -201,7 +201,13 @@ ErrorCode write_index(VecIndexPtr index, const std::string &location) {
return KNOWHERE_UNEXPECTED_ERROR;
} catch (std::exception &e) {
WRAPPER_LOG_ERROR << e.what();
return KNOWHERE_ERROR;
std::string estring(e.what());
if (estring.find("No space left on device") != estring.npos) {
WRAPPER_LOG_ERROR << "No space left on the device";
return KNOWHERE_NO_SPACE;
} else {
return KNOWHERE_ERROR;
}
}
return KNOWHERE_SUCCESS;
}
......@@ -213,7 +219,7 @@ void AutoGenParams(const IndexType &type, const long &size, zilliz::knowhere::Co
if (size <= TYPICAL_COUNT / 16384 + 1) {
//handle less row count, avoid nlist set to 0
cfg["nlist"] = 1;
} else if (int(size / TYPICAL_COUNT) * nlist == 0) {
} else if (int(size / TYPICAL_COUNT) *nlist == 0) {
//calculate a proper nlist if nlist not specified or size less than TYPICAL_COUNT
cfg["nlist"] = int(size / TYPICAL_COUNT * 16384);
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册