diff --git a/ci/jenkinsfile/dev_test.groovy b/ci/jenkinsfile/dev_test.groovy index 2aa3f4dd7fcf6168f895e9854267eda40e8b49de..fc3badae0c573451d3b9ecdfa053b10959ca2461 100644 --- a/ci/jenkinsfile/dev_test.groovy +++ b/ci/jenkinsfile/dev_test.groovy @@ -1,4 +1,4 @@ -timeout(time: 30, unit: 'MINUTES') { +timeout(time: 40, unit: 'MINUTES') { try { dir ("${PROJECT_NAME}_test") { checkout([$class: 'GitSCM', branches: [[name: "${SEMVER}"]], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[credentialsId: "${params.GIT_USER}", url: "git@192.168.1.105:Test/milvus_test.git", name: 'origin', refspec: "+refs/heads/${SEMVER}:refs/remotes/origin/${SEMVER}"]]]) diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 635c213fb067d7d13b14baef1eb0527ec8d9e157..b8d93c87601cae90f5911fbf6b9e3535fd845eed 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -50,7 +50,11 @@ Please mark all change in change log and use the ticket from JIRA. - MS-261 - Update faiss version to 1.5.3 and add BUILD_FAISS_WITH_MKL as an option - MS-266 - Improve topk reduce time by using multi-threads - MS-275 - Avoid sqlite logic error excetion -- MS-278 - add IndexStatsHelper +- MS-278 - Add IndexStatsHelper +- MS-312 - Set openmp thread number by config +- MS-305 - Add CPU core percent metric +- MS-310 - Add milvus CPU utilization ratio and CPU/GPU temperature metrics +- MS-324 - Show error when there is not enough gpu memory to build index ## New Feature - MS-180 - Add new mem manager diff --git a/cpp/conf/server_config.template b/cpp/conf/server_config.template index 63090c62c73199146206a1004d5cd2ab493d6750..c80e981bcd002e6d4d151c0231add6d852d3cd97 100644 --- a/cpp/conf/server_config.template +++ b/cpp/conf/server_config.template @@ -43,4 +43,5 @@ engine_config: nprobe: 10 nlist: 16384 use_blas_threshold: 20 - metric_type: L2 # compare vectors by euclidean distance(L2) or inner product(IP), optional: L2 or IP + metric_type: L2 # compare vectors by euclidean 
distance(L2) or inner product(IP), optional: L2 or IP + omp_thread_num: 0 # how many compute threads be used by engine, 0 means use all cpu core to compute diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index 444f1b0941b3a4defdccbc7cc9c74b6d3d55ac7a..17537556b5b1a5e4f905ca6bed52dac0cb13b9ee 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -326,7 +326,8 @@ void DBImpl::StartMetricTask() { server::Metrics::GetInstance().OctetsSet(); server::Metrics::GetInstance().CPUCoreUsagePercentSet(); - + server::Metrics::GetInstance().GPUTemperature(); + server::Metrics::GetInstance().CPUTemperature(); ENGINE_LOG_TRACE << "Metric task finished"; } @@ -541,11 +542,27 @@ Status DBImpl::BuildIndex(const meta::TableFileSchema& file) { } //step 3: build index - auto start_time = METRICS_NOW_TIME; - auto index = to_index->BuildIndex(table_file.location_); - auto end_time = METRICS_NOW_TIME; - auto total_time = METRICS_MICROSECONDS(start_time, end_time); - server::Metrics::GetInstance().BuildIndexDurationSecondsHistogramObserve(total_time); + std::shared_ptr index; + + try { + auto start_time = METRICS_NOW_TIME; + index = to_index->BuildIndex(table_file.location_); + auto end_time = METRICS_NOW_TIME; + auto total_time = METRICS_MICROSECONDS(start_time, end_time); + server::Metrics::GetInstance().BuildIndexDurationSecondsHistogramObserve(total_time); + } catch (std::exception& ex) { + //typical error: out of gpu memory + std::string msg = "BuildIndex encounter exception" + std::string(ex.what()); + ENGINE_LOG_ERROR << msg; + + table_file.file_type_ = meta::TableFileSchema::TO_DELETE; + status = meta_ptr_->UpdateTableFile(table_file); + ENGINE_LOG_DEBUG << "Failed to update file to index, mark file: " << table_file.file_id_ << " to to_delete"; + + std::cout << "ERROR: failed to build index, index file is too large or gpu memory is not enough" << std::endl; + + return Status::Error(msg); + } //step 4: if table has been deleted, dont save index file bool 
has_table = false; @@ -556,7 +573,22 @@ Status DBImpl::BuildIndex(const meta::TableFileSchema& file) { } //step 5: save index file - index->Serialize(); + try { + index->Serialize(); + } catch (std::exception& ex) { + //typical error: out of disk space or permition denied + std::string msg = "Serialize index encounter exception" + std::string(ex.what()); + ENGINE_LOG_ERROR << msg; + + table_file.file_type_ = meta::TableFileSchema::TO_DELETE; + status = meta_ptr_->UpdateTableFile(table_file); + ENGINE_LOG_DEBUG << "Failed to update file to index, mark file: " << table_file.file_id_ << " to to_delete"; + + std::cout << "ERROR: failed to persist index file: " << table_file.location_ + << ", possible out of disk space" << std::endl; + + return Status::Error(msg); + } //step 6: update meta table_file.file_type_ = meta::TableFileSchema::INDEX; diff --git a/cpp/src/db/DBMetaImpl.cpp b/cpp/src/db/DBMetaImpl.cpp index f8cc34a146783a04e46d617394784d56ae104514..c629f378161020a9eb680911a61b4dca0efe44ec 100644 --- a/cpp/src/db/DBMetaImpl.cpp +++ b/cpp/src/db/DBMetaImpl.cpp @@ -1005,7 +1005,7 @@ Status DBMetaImpl::CleanUpFilesWithTTL(uint16_t seconds) { table_file.date_ = std::get<3>(file); utils::DeleteTableFilePath(options_, table_file); - ENGINE_LOG_DEBUG << "Removing file id:" << table_file.id_ << " location:" << table_file.location_; + ENGINE_LOG_DEBUG << "Removing file id:" << table_file.file_id_ << " location:" << table_file.location_; ConnectorPtr->remove(table_file.id_); } diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h index 61e9e7680f9e814f1f6aa4be41b589ed55b11cf6..23a2427b3537f66a0747c1d6ba25c172014bc922 100644 --- a/cpp/src/metrics/MetricBase.h +++ b/cpp/src/metrics/MetricBase.h @@ -66,6 +66,8 @@ class MetricsBase{ virtual void OctetsSet() {}; virtual void CPUCoreUsagePercentSet() {}; + virtual void GPUTemperature() {}; + virtual void CPUTemperature() {}; }; diff --git a/cpp/src/metrics/PrometheusMetrics.cpp 
b/cpp/src/metrics/PrometheusMetrics.cpp index 3d83bff864bbf26dd8b3f405e4d600c9415d5dbf..c7729ffdbca62408ffad4ecb8379266a690b03ca 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -34,8 +34,6 @@ PrometheusMetrics::Init() { return SERVER_UNEXPECTED_ERROR; } - // - return SERVER_SUCCESS; } @@ -44,8 +42,6 @@ PrometheusMetrics::Init() { void PrometheusMetrics::CPUUsagePercentSet() { if(!startup_) return ; - int numProcessor = server::SystemInfo::GetInstance().num_processor(); - double usage_percent = server::SystemInfo::GetInstance().CPUPercent(); CPU_usage_percent_.Set(usage_percent); } @@ -64,13 +60,11 @@ PrometheusMetrics::GPUPercentGaugeSet() { std::vector used_total = server::SystemInfo::GetInstance().GPUMemoryTotal(); std::vector used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed(); - - for (int i = 0; i < numDevice; i++) { + for (int i = 0; i < numDevice; ++i) { prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); double percent = (double)used_memory[i] / (double)used_total[i]; GPU_percent.Set(percent * 100); } - } void PrometheusMetrics::GPUMemoryUsageGaugeSet() { @@ -79,7 +73,7 @@ void PrometheusMetrics::GPUMemoryUsageGaugeSet() { constexpr unsigned long long MtoB = 1024*1024; int numDevice = server::SystemInfo::GetInstance().num_device(); - for (int i = 0; i < numDevice; i++) { + for (int i = 0; i < numDevice; ++i) { prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}}); GPU_memory.Set(values[i] / MtoB); } @@ -142,12 +136,35 @@ void PrometheusMetrics::CPUCoreUsagePercentSet() { std::vector cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent(); - for (int i = 0; i < cpu_core_percent.size(); i++) { + for (int i = 0; i < cpu_core_percent.size(); ++i) { prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}}); core_percent.Set(cpu_core_percent[i]); } } +void PrometheusMetrics::GPUTemperature() { + if 
(!startup_) + return; + + std::vector GPU_temperatures = server::SystemInfo::GetInstance().GPUTemperature(); + + for (int i = 0; i < GPU_temperatures.size(); ++i) { + prometheus::Gauge &gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}}); + gpu_temp.Set(GPU_temperatures[i]); + } +} + +void PrometheusMetrics::CPUTemperature() { + if (!startup_) + return; + + std::vector CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature(); + + for (int i = 0; i < CPU_temperatures.size(); ++i) { + prometheus::Gauge &cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}}); + cpu_temp.Set(CPU_temperatures[i]); + } +} } } diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index 590130f4448ad4e2fe4f3af1d95c44221f3840aa..282c58800ca5bf08ecdd9f0af123ee943dbf2904 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -79,7 +79,6 @@ class PrometheusMetrics: public MetricsBase { void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);}; void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);}; void CPUUsagePercentSet() override ; - void CPUCoreUsagePercentSet() override; void RAMUsagePercentSet() override ; @@ -93,6 +92,9 @@ class PrometheusMetrics: public MetricsBase { void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);}; void OctetsSet() override ; + void GPUTemperature() override; + void CPUTemperature() override; + @@ -396,7 +398,7 @@ class PrometheusMetrics: public MetricsBase { .Name("CPU_usage_percent") .Help("CPU usage percent by this this process") .Register(*registry_); - prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}}); + prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "avg"}}); prometheus::Family 
&RAM_ = prometheus::BuildGauge() @@ -444,6 +446,15 @@ class PrometheusMetrics: public MetricsBase { prometheus::Gauge &outoctets_gauge_ = octets_.Add({{"type", "outoctets"}}); + prometheus::Family &GPU_temperature_ = prometheus::BuildGauge() + .Name("GPU_temperature") + .Help("GPU temperature") + .Register(*registry_); + + prometheus::Family &CPU_temperature_ = prometheus::BuildGauge() + .Name("CPU_temperature") + .Help("CPU temperature") + .Register(*registry_); }; diff --git a/cpp/src/metrics/SystemInfo.cpp b/cpp/src/metrics/SystemInfo.cpp index 7628db78bbb7bbc5b2fbc5c0a8bf7f347534ba73..3b6698d42bfb8896e537c9e55f78585223f6de90 100644 --- a/cpp/src/metrics/SystemInfo.cpp +++ b/cpp/src/metrics/SystemInfo.cpp @@ -36,6 +36,9 @@ void SystemInfo::Init() { num_processors_ = 0; while(fgets(line, 128, file) != NULL){ if (strncmp(line, "processor", 9) == 0) num_processors_++; + if (strncmp(line, "physical", 8) == 0) { + num_physical_processors_ = ParseLine(line); + } } total_ram_ = GetPhysicalMemory(); fclose(file); @@ -108,8 +111,6 @@ SystemInfo::MemoryPercent() { return (double)(GetProcessUsedMemory()*100)/(double)total_ram_; } - - std::vector SystemInfo::CPUCorePercent() { std::vector prev_work_time_array; @@ -119,7 +120,7 @@ SystemInfo::CPUCorePercent() { std::vector cur_total_time_array = getTotalCpuTime(cur_work_time_array); std::vector cpu_core_percent; - for (int i = 0; i < num_processors_; i++) { + for (int i = 1; i < num_processors_; i++) { double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i]; double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i]; cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100); @@ -181,7 +182,6 @@ SystemInfo::CPUPercent() { percent = (time_sample.tms_stime - last_sys_cpu_) + (time_sample.tms_utime - last_user_cpu_); percent /= (now - last_cpu_); - percent /= num_processors_; percent *= 100; } last_cpu_ = now; @@ -207,6 +207,36 @@ SystemInfo::GPUMemoryTotal() { return result; } 
+// Queries NVML for the temperature of every GPU and returns one entry per
+// device, in device-index order. A device whose handle or temperature cannot
+// be read contributes 0, so positions in the result keep matching device
+// indices and no indeterminate value is ever published.
+std::vector
+SystemInfo::GPUTemperature(){
+    if(!initialized_) Init();
+    std::vector result;
+    // num_device_ is unsigned, so the index is unsigned as well
+    for (unsigned int i = 0; i < num_device_; i++) {
+        nvmlDevice_t device;
+        unsigned int temp = 0;
+        if (nvmlDeviceGetHandleByIndex(i, &device) != NVML_SUCCESS ||
+            nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temp) != NVML_SUCCESS) {
+            // NVML gave no usable reading for this device
+            temp = 0;
+        }
+        result.push_back(temp);
+    }
+    return result;
+}
+// Reads /sys/class/thermal/thermal_zone<i>/temp for i in
+// [0, num_physical_processors_] and returns each reading divided by 1000
+// (sysfs reports millidegrees Celsius). Stops at the first zone that cannot
+// be opened or parsed and returns the readings collected so far.
+// NOTE(review): the inclusive upper bound reads one zone more than
+// num_physical_processors_, and nothing here shows that thermal zone numbers
+// correspond to physical processor ids - confirm the intended range.
+std::vector
+SystemInfo::CPUTemperature(){
+    std::vector result;
+    for (int i = 0; i <= num_physical_processors_; ++i) {
+        std::string path = "/sys/class/thermal/thermal_zone" + std::to_string(i) + "/temp";
+        FILE *file = fopen(path.data(), "r");
+        if (file == NULL) {
+            perror("Could not open thermal file");
+            return result;
+        }
+        float temp = 0;
+        int matched = fscanf(file, "%f", &temp);
+        // close before moving to the next zone so handles do not accumulate
+        fclose(file);
+        if (matched != 1) {
+            // unreadable content: stop, as for a zone that cannot be opened
+            return result;
+        }
+        result.push_back(temp / 1000);
+    }
+    // every requested zone was read
+    return result;
+}
+
 std::vector
 SystemInfo::GPUMemoryUsed() {
     // get GPU memory used
diff --git a/cpp/src/metrics/SystemInfo.h b/cpp/src/metrics/SystemInfo.h
index 629aaf72201d03318b0291ede89ceb78c6fc06df..ab27375c73828c505824ed70a637bef61256cce5 100644
--- a/cpp/src/metrics/SystemInfo.h
+++ b/cpp/src/metrics/SystemInfo.h
@@ -32,6 +32,7 @@ class SystemInfo {
     clock_t last_user_cpu_ = clock_t();
     std::chrono::system_clock::time_point net_time_ = std::chrono::system_clock::now();
     int num_processors_ = 0;
+    int num_physical_processors_ = 0;
     //number of GPU
     unsigned int num_device_ = 0;
     unsigned long long in_octets_ = 0;
@@ -47,6 +48,7 @@ class SystemInfo {
     void Init();
     int num_processor() const { return num_processors_;};
+    int num_physical_processors() const { return num_physical_processors_; };
     int num_device() const {return num_device_;};
     unsigned long long get_inoctets() { return in_octets_;};
     unsigned long long get_octets() { return out_octets_;};
@@ -65,7 +67,8 @@ class SystemInfo {
     std::vector CPUCorePercent();
     std::vector getTotalCpuTime(std::vector &workTime);
-
+    std::vector GPUTemperature();
+    std::vector CPUTemperature();
 };
diff --git a/cpp/src/server/DBWrapper.cpp b/cpp/src/server/DBWrapper.cpp index
e908f62d7c4390606ff2205fc3f422aa8367536a..4487a37e198ff5291e0779c9d29d596f02e202db 100644 --- a/cpp/src/server/DBWrapper.cpp +++ b/cpp/src/server/DBWrapper.cpp @@ -10,11 +10,14 @@ #include "utils/Log.h" #include "utils/StringHelpFunctions.h" +#include + namespace zilliz { namespace milvus { namespace server { DBWrapper::DBWrapper() { + //db config zilliz::milvus::engine::Options opt; ConfigNode& db_config = ServerConfig::GetInstance().GetConfig(CONFIG_DB); opt.meta.backend_uri = db_config.GetValue(CONFIG_DB_URL); @@ -37,6 +40,7 @@ DBWrapper::DBWrapper() { kill(0, SIGUSR1); } + // cache config ConfigNode& cache_config = ServerConfig::GetInstance().GetConfig(CONFIG_CACHE); opt.insert_cache_immediately_ = cache_config.GetBoolValue(CONFIG_INSERT_CACHE_IMMEDIATELY, false); @@ -56,6 +60,14 @@ DBWrapper::DBWrapper() { kill(0, SIGUSR1); } + // engine config + ConfigNode& engine_config = ServerConfig::GetInstance().GetConfig(CONFIG_ENGINE); + int32_t omp_thread = engine_config.GetInt32Value(CONFIG_OMP_THREAD_NUM, 0); + if(omp_thread > 0) { + omp_set_num_threads(omp_thread); + SERVER_LOG_DEBUG << "Specify openmp thread number: " << omp_thread; + } + //set archive config engine::ArchiveConf::CriteriaT criterial; int64_t disk = db_config.GetInt64Value(CONFIG_DB_ARCHIVE_DISK, 0); diff --git a/cpp/src/server/ServerConfig.h b/cpp/src/server/ServerConfig.h index bb7d5d366928ab7b1edad4bea5b0e23cc60fdd21..e899d1f8d6d4c1893dbd5788a85aec73f12b6713 100644 --- a/cpp/src/server/ServerConfig.h +++ b/cpp/src/server/ServerConfig.h @@ -53,6 +53,7 @@ static const std::string CONFIG_NPROBE = "nprobe"; static const std::string CONFIG_NLIST = "nlist"; static const std::string CONFIG_DCBT = "use_blas_threshold"; static const std::string CONFIG_METRICTYPE = "metric_type"; +static const std::string CONFIG_OMP_THREAD_NUM = "omp_thread_num"; class ServerConfig { public: