diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 5c5d7ccb0cb62eb287e55a9bc141bbfa19954d43..ea37ff28c8e965ceeb489bb74df0457756dbdb87 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -41,6 +41,8 @@ Please mark all change in change log and use the ticket from JIRA. - MS-275 - Avoid sqlite logic error excetion - MS-278 - add IndexStatsHelper - MS-312 - Set openmp thread number by config +- MS-305 - add CPU core percent metric +- MS-310 - add milvus CPU utilization ratio and CPU/GPU temperature metrics ## New Feature - MS-180 - Add new mem manager diff --git a/cpp/src/db/DBImpl.cpp b/cpp/src/db/DBImpl.cpp index 6207c7108c5b2de29b51b44fb869627256cd42ff..fa4066e27c88503680d717f334bc3c89855f4380 100644 --- a/cpp/src/db/DBImpl.cpp +++ b/cpp/src/db/DBImpl.cpp @@ -319,7 +319,8 @@ void DBImpl::StartMetricTask() { server::Metrics::GetInstance().OctetsSet(); server::Metrics::GetInstance().CPUCoreUsagePercentSet(); - + server::Metrics::GetInstance().GPUTemperature(); + server::Metrics::GetInstance().CPUTemperature(); ENGINE_LOG_TRACE << "Metric task finished"; } diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h index 61e9e7680f9e814f1f6aa4be41b589ed55b11cf6..23a2427b3537f66a0747c1d6ba25c172014bc922 100644 --- a/cpp/src/metrics/MetricBase.h +++ b/cpp/src/metrics/MetricBase.h @@ -66,6 +66,8 @@ class MetricsBase{ virtual void OctetsSet() {}; virtual void CPUCoreUsagePercentSet() {}; + virtual void GPUTemperature() {}; + virtual void CPUTemperature() {}; }; diff --git a/cpp/src/metrics/PrometheusMetrics.cpp b/cpp/src/metrics/PrometheusMetrics.cpp index 3d83bff864bbf26dd8b3f405e4d600c9415d5dbf..c7729ffdbca62408ffad4ecb8379266a690b03ca 100644 --- a/cpp/src/metrics/PrometheusMetrics.cpp +++ b/cpp/src/metrics/PrometheusMetrics.cpp @@ -34,8 +34,6 @@ PrometheusMetrics::Init() { return SERVER_UNEXPECTED_ERROR; } - // - return SERVER_SUCCESS; } @@ -44,8 +42,6 @@ PrometheusMetrics::Init() { void PrometheusMetrics::CPUUsagePercentSet() { if(!startup_) return 
; - int numProcessor = server::SystemInfo::GetInstance().num_processor(); - double usage_percent = server::SystemInfo::GetInstance().CPUPercent(); CPU_usage_percent_.Set(usage_percent); } @@ -64,13 +60,11 @@ PrometheusMetrics::GPUPercentGaugeSet() { std::vector used_total = server::SystemInfo::GetInstance().GPUMemoryTotal(); std::vector used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed(); - - for (int i = 0; i < numDevice; i++) { + for (int i = 0; i < numDevice; ++i) { prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); double percent = (double)used_memory[i] / (double)used_total[i]; GPU_percent.Set(percent * 100); } - } void PrometheusMetrics::GPUMemoryUsageGaugeSet() { @@ -79,7 +73,7 @@ void PrometheusMetrics::GPUMemoryUsageGaugeSet() { constexpr unsigned long long MtoB = 1024*1024; int numDevice = server::SystemInfo::GetInstance().num_device(); - for (int i = 0; i < numDevice; i++) { + for (int i = 0; i < numDevice; ++i) { prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}}); GPU_memory.Set(values[i] / MtoB); } @@ -142,12 +136,35 @@ void PrometheusMetrics::CPUCoreUsagePercentSet() { std::vector cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent(); - for (int i = 0; i < cpu_core_percent.size(); i++) { + for (int i = 0; i < cpu_core_percent.size(); ++i) { prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}}); core_percent.Set(cpu_core_percent[i]); } } +void PrometheusMetrics::GPUTemperature() { + if (!startup_) + return; + + std::vector GPU_temperatures = server::SystemInfo::GetInstance().GPUTemperature(); + + for (int i = 0; i < GPU_temperatures.size(); ++i) { + prometheus::Gauge &gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}}); + gpu_temp.Set(GPU_temperatures[i]); + } +} + +void PrometheusMetrics::CPUTemperature() { + if (!startup_) + return; + + std::vector CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature(); + 
+ for (int i = 0; i < CPU_temperatures.size(); ++i) { + prometheus::Gauge &cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}}); + cpu_temp.Set(CPU_temperatures[i]); + } +} } } diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index 590130f4448ad4e2fe4f3af1d95c44221f3840aa..282c58800ca5bf08ecdd9f0af123ee943dbf2904 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -79,7 +79,6 @@ class PrometheusMetrics: public MetricsBase { void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);}; void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);}; void CPUUsagePercentSet() override ; - void CPUCoreUsagePercentSet() override; void RAMUsagePercentSet() override ; @@ -93,6 +92,9 @@ class PrometheusMetrics: public MetricsBase { void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);}; void OctetsSet() override ; + void GPUTemperature() override; + void CPUTemperature() override; + @@ -396,7 +398,7 @@ class PrometheusMetrics: public MetricsBase { .Name("CPU_usage_percent") .Help("CPU usage percent by this this process") .Register(*registry_); - prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}}); + prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "avg"}}); prometheus::Family &RAM_ = prometheus::BuildGauge() @@ -444,6 +446,15 @@ class PrometheusMetrics: public MetricsBase { prometheus::Gauge &outoctets_gauge_ = octets_.Add({{"type", "outoctets"}}); + prometheus::Family &GPU_temperature_ = prometheus::BuildGauge() + .Name("GPU_temperature") + .Help("GPU temperature") + .Register(*registry_); + + prometheus::Family &CPU_temperature_ = prometheus::BuildGauge() + .Name("CPU_temperature") + .Help("CPU temperature") + 
.Register(*registry_); }; diff --git a/cpp/src/metrics/SystemInfo.cpp b/cpp/src/metrics/SystemInfo.cpp index 7628db78bbb7bbc5b2fbc5c0a8bf7f347534ba73..3b6698d42bfb8896e537c9e55f78585223f6de90 100644 --- a/cpp/src/metrics/SystemInfo.cpp +++ b/cpp/src/metrics/SystemInfo.cpp @@ -36,6 +36,9 @@ void SystemInfo::Init() { num_processors_ = 0; while(fgets(line, 128, file) != NULL){ if (strncmp(line, "processor", 9) == 0) num_processors_++; + if (strncmp(line, "physical", 8) == 0) { + num_physical_processors_ = ParseLine(line); + } } total_ram_ = GetPhysicalMemory(); fclose(file); @@ -108,8 +111,6 @@ SystemInfo::MemoryPercent() { return (double)(GetProcessUsedMemory()*100)/(double)total_ram_; } - - std::vector SystemInfo::CPUCorePercent() { std::vector prev_work_time_array; @@ -119,7 +120,7 @@ SystemInfo::CPUCorePercent() { std::vector cur_total_time_array = getTotalCpuTime(cur_work_time_array); std::vector cpu_core_percent; - for (int i = 0; i < num_processors_; i++) { + for (int i = 1; i < num_processors_; i++) { double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i]; double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i]; cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100); @@ -181,7 +182,6 @@ SystemInfo::CPUPercent() { percent = (time_sample.tms_stime - last_sys_cpu_) + (time_sample.tms_utime - last_user_cpu_); percent /= (now - last_cpu_); - percent /= num_processors_; percent *= 100; } last_cpu_ = now; @@ -207,6 +207,41 @@ SystemInfo::GPUMemoryTotal() { return result; } +std::vector +SystemInfo::GPUTemperature(){ + if(!initialized_) Init(); + std::vector result; + for (int i = 0; i < num_device_; i++) { + nvmlDevice_t device; + nvmlDeviceGetHandleByIndex(i, &device); + unsigned int temp; + nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU,&temp); + result.push_back(temp); + } + return result; +} +/* Reads /sys/class/thermal/thermal_zone<i>/temp for i in [0, num_physical_processors_] and returns each reading divided by 1000 (sysfs presumably reports millidegrees Celsius - confirm against the kernel thermal docs). Stops at the first zone that cannot be opened or parsed and returns the readings collected so far. NOTE(review): the inclusive upper bound presumably reflects a zero-based "physical id" parsed in Init() - confirm. */ +std::vector +SystemInfo::CPUTemperature(){ + std::vector result; + for (int i = 0; i <= num_physical_processors_; ++i) 
{ + std::string path = "/sys/class/thermal/thermal_zone" + std::to_string(i) + "/temp"; + FILE *file = fopen(path.data(), "r"); + if (file == NULL) { + perror("Could not open thermal file"); + return result; + } + float temp = 0; + const int parsed = fscanf(file, "%f", &temp); /* 1 when a value was read */ + fclose(file); /* close before any early return so the handle is never leaked across metric ticks */ + if (parsed != 1) { + return result; /* unreadable zone: same early-out as the fopen failure above, never push an unset value */ + } + result.push_back(temp / 1000); + } + return result; /* previously missing: falling off the end of a non-void function is undefined behaviour */ +} + std::vector SystemInfo::GPUMemoryUsed() { // get GPU memory used diff --git a/cpp/src/metrics/SystemInfo.h b/cpp/src/metrics/SystemInfo.h index 629aaf72201d03318b0291ede89ceb78c6fc06df..ab27375c73828c505824ed70a637bef61256cce5 100644 --- a/cpp/src/metrics/SystemInfo.h +++ b/cpp/src/metrics/SystemInfo.h @@ -32,6 +32,7 @@ class SystemInfo { clock_t last_user_cpu_ = clock_t(); std::chrono::system_clock::time_point net_time_ = std::chrono::system_clock::now(); int num_processors_ = 0; + int num_physical_processors_ = 0; //number of GPU unsigned int num_device_ = 0; unsigned long long in_octets_ = 0; @@ -47,6 +48,7 @@ class SystemInfo { void Init(); int num_processor() const { return num_processors_;}; + int num_physical_processors() const { return num_physical_processors_; }; int num_device() const {return num_device_;}; unsigned long long get_inoctets() { return in_octets_;}; unsigned long long get_octets() { return out_octets_;}; @@ -65,7 +67,8 @@ class SystemInfo { std::vector CPUCorePercent(); std::vector getTotalCpuTime(std::vector &workTime); - + std::vector GPUTemperature(); + std::vector CPUTemperature(); };