提交 e9bb2e77 编写于 作者: S starlord

MS-312 Set openmp thread number by config


Former-commit-id: 897d1c7387655046b4f769d695b9415785bd5108
...@@ -41,6 +41,8 @@ Please mark all change in change log and use the ticket from JIRA. ...@@ -41,6 +41,8 @@ Please mark all change in change log and use the ticket from JIRA.
- MS-275 - Avoid sqlite logic error exception - MS-275 - Avoid sqlite logic error exception
- MS-278 - add IndexStatsHelper - MS-278 - add IndexStatsHelper
- MS-312 - Set openmp thread number by config - MS-312 - Set openmp thread number by config
- MS-305 - add CPU core percent metric
- MS-310 - add milvus CPU utilization ratio and CPU/GPU temperature metrics
## New Feature ## New Feature
- MS-180 - Add new mem manager - MS-180 - Add new mem manager
......
...@@ -319,7 +319,8 @@ void DBImpl::StartMetricTask() { ...@@ -319,7 +319,8 @@ void DBImpl::StartMetricTask() {
server::Metrics::GetInstance().OctetsSet(); server::Metrics::GetInstance().OctetsSet();
server::Metrics::GetInstance().CPUCoreUsagePercentSet(); server::Metrics::GetInstance().CPUCoreUsagePercentSet();
server::Metrics::GetInstance().GPUTemperature();
server::Metrics::GetInstance().CPUTemperature();
ENGINE_LOG_TRACE << "Metric task finished"; ENGINE_LOG_TRACE << "Metric task finished";
} }
......
...@@ -66,6 +66,8 @@ class MetricsBase{ ...@@ -66,6 +66,8 @@ class MetricsBase{
virtual void OctetsSet() {}; virtual void OctetsSet() {};
virtual void CPUCoreUsagePercentSet() {}; virtual void CPUCoreUsagePercentSet() {};
virtual void GPUTemperature() {};
virtual void CPUTemperature() {};
}; };
......
...@@ -34,8 +34,6 @@ PrometheusMetrics::Init() { ...@@ -34,8 +34,6 @@ PrometheusMetrics::Init() {
return SERVER_UNEXPECTED_ERROR; return SERVER_UNEXPECTED_ERROR;
} }
//
return SERVER_SUCCESS; return SERVER_SUCCESS;
} }
...@@ -44,8 +42,6 @@ PrometheusMetrics::Init() { ...@@ -44,8 +42,6 @@ PrometheusMetrics::Init() {
void void
PrometheusMetrics::CPUUsagePercentSet() { PrometheusMetrics::CPUUsagePercentSet() {
if(!startup_) return ; if(!startup_) return ;
int numProcessor = server::SystemInfo::GetInstance().num_processor();
double usage_percent = server::SystemInfo::GetInstance().CPUPercent(); double usage_percent = server::SystemInfo::GetInstance().CPUPercent();
CPU_usage_percent_.Set(usage_percent); CPU_usage_percent_.Set(usage_percent);
} }
...@@ -64,13 +60,11 @@ PrometheusMetrics::GPUPercentGaugeSet() { ...@@ -64,13 +60,11 @@ PrometheusMetrics::GPUPercentGaugeSet() {
std::vector<unsigned long long > used_total = server::SystemInfo::GetInstance().GPUMemoryTotal(); std::vector<unsigned long long > used_total = server::SystemInfo::GetInstance().GPUMemoryTotal();
std::vector<unsigned long long > used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed(); std::vector<unsigned long long > used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed();
for (int i = 0; i < numDevice; ++i) {
for (int i = 0; i < numDevice; i++) {
prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}}); prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}});
double percent = (double)used_memory[i] / (double)used_total[i]; double percent = (double)used_memory[i] / (double)used_total[i];
GPU_percent.Set(percent * 100); GPU_percent.Set(percent * 100);
} }
} }
void PrometheusMetrics::GPUMemoryUsageGaugeSet() { void PrometheusMetrics::GPUMemoryUsageGaugeSet() {
...@@ -79,7 +73,7 @@ void PrometheusMetrics::GPUMemoryUsageGaugeSet() { ...@@ -79,7 +73,7 @@ void PrometheusMetrics::GPUMemoryUsageGaugeSet() {
constexpr unsigned long long MtoB = 1024*1024; constexpr unsigned long long MtoB = 1024*1024;
int numDevice = server::SystemInfo::GetInstance().num_device(); int numDevice = server::SystemInfo::GetInstance().num_device();
for (int i = 0; i < numDevice; i++) { for (int i = 0; i < numDevice; ++i) {
prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}}); prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}});
GPU_memory.Set(values[i] / MtoB); GPU_memory.Set(values[i] / MtoB);
} }
...@@ -142,12 +136,35 @@ void PrometheusMetrics::CPUCoreUsagePercentSet() { ...@@ -142,12 +136,35 @@ void PrometheusMetrics::CPUCoreUsagePercentSet() {
std::vector<double> cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent(); std::vector<double> cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent();
for (int i = 0; i < cpu_core_percent.size(); i++) { for (int i = 0; i < cpu_core_percent.size(); ++i) {
prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}}); prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}});
core_percent.Set(cpu_core_percent[i]); core_percent.Set(cpu_core_percent[i]);
} }
} }
void PrometheusMetrics::GPUTemperature() {
if (!startup_)
return;
std::vector<unsigned int> GPU_temperatures = server::SystemInfo::GetInstance().GPUTemperature();
for (int i = 0; i < GPU_temperatures.size(); ++i) {
prometheus::Gauge &gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}});
gpu_temp.Set(GPU_temperatures[i]);
}
}
void PrometheusMetrics::CPUTemperature() {
if (!startup_)
return;
std::vector<float> CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature();
for (int i = 0; i < CPU_temperatures.size(); ++i) {
prometheus::Gauge &cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}});
cpu_temp.Set(CPU_temperatures[i]);
}
}
} }
} }
......
...@@ -79,7 +79,6 @@ class PrometheusMetrics: public MetricsBase { ...@@ -79,7 +79,6 @@ class PrometheusMetrics: public MetricsBase {
void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);}; void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);};
void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);}; void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);};
void CPUUsagePercentSet() override ; void CPUUsagePercentSet() override ;
void CPUCoreUsagePercentSet() override; void CPUCoreUsagePercentSet() override;
void RAMUsagePercentSet() override ; void RAMUsagePercentSet() override ;
...@@ -93,6 +92,9 @@ class PrometheusMetrics: public MetricsBase { ...@@ -93,6 +92,9 @@ class PrometheusMetrics: public MetricsBase {
void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);}; void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);};
void OctetsSet() override ; void OctetsSet() override ;
void GPUTemperature() override;
void CPUTemperature() override;
...@@ -396,7 +398,7 @@ class PrometheusMetrics: public MetricsBase { ...@@ -396,7 +398,7 @@ class PrometheusMetrics: public MetricsBase {
.Name("CPU_usage_percent") .Name("CPU_usage_percent")
.Help("CPU usage percent by this this process") .Help("CPU usage percent by this this process")
.Register(*registry_); .Register(*registry_);
prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}}); prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "avg"}});
prometheus::Family<prometheus::Gauge> &RAM_ = prometheus::BuildGauge() prometheus::Family<prometheus::Gauge> &RAM_ = prometheus::BuildGauge()
...@@ -444,6 +446,15 @@ class PrometheusMetrics: public MetricsBase { ...@@ -444,6 +446,15 @@ class PrometheusMetrics: public MetricsBase {
prometheus::Gauge &outoctets_gauge_ = octets_.Add({{"type", "outoctets"}}); prometheus::Gauge &outoctets_gauge_ = octets_.Add({{"type", "outoctets"}});
prometheus::Family<prometheus::Gauge> &GPU_temperature_ = prometheus::BuildGauge()
.Name("GPU_temperature")
.Help("GPU temperature")
.Register(*registry_);
prometheus::Family<prometheus::Gauge> &CPU_temperature_ = prometheus::BuildGauge()
.Name("CPU_temperature")
.Help("CPU temperature")
.Register(*registry_);
}; };
......
...@@ -36,6 +36,9 @@ void SystemInfo::Init() { ...@@ -36,6 +36,9 @@ void SystemInfo::Init() {
num_processors_ = 0; num_processors_ = 0;
while(fgets(line, 128, file) != NULL){ while(fgets(line, 128, file) != NULL){
if (strncmp(line, "processor", 9) == 0) num_processors_++; if (strncmp(line, "processor", 9) == 0) num_processors_++;
if (strncmp(line, "physical", 8) == 0) {
num_physical_processors_ = ParseLine(line);
}
} }
total_ram_ = GetPhysicalMemory(); total_ram_ = GetPhysicalMemory();
fclose(file); fclose(file);
...@@ -108,8 +111,6 @@ SystemInfo::MemoryPercent() { ...@@ -108,8 +111,6 @@ SystemInfo::MemoryPercent() {
return (double)(GetProcessUsedMemory()*100)/(double)total_ram_; return (double)(GetProcessUsedMemory()*100)/(double)total_ram_;
} }
std::vector<double> std::vector<double>
SystemInfo::CPUCorePercent() { SystemInfo::CPUCorePercent() {
std::vector<unsigned long long> prev_work_time_array; std::vector<unsigned long long> prev_work_time_array;
...@@ -119,7 +120,7 @@ SystemInfo::CPUCorePercent() { ...@@ -119,7 +120,7 @@ SystemInfo::CPUCorePercent() {
std::vector<unsigned long long> cur_total_time_array = getTotalCpuTime(cur_work_time_array); std::vector<unsigned long long> cur_total_time_array = getTotalCpuTime(cur_work_time_array);
std::vector<double> cpu_core_percent; std::vector<double> cpu_core_percent;
for (int i = 0; i < num_processors_; i++) { for (int i = 1; i < num_processors_; i++) {
double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i]; double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i];
double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i]; double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i];
cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100); cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100);
...@@ -181,7 +182,6 @@ SystemInfo::CPUPercent() { ...@@ -181,7 +182,6 @@ SystemInfo::CPUPercent() {
percent = (time_sample.tms_stime - last_sys_cpu_) + percent = (time_sample.tms_stime - last_sys_cpu_) +
(time_sample.tms_utime - last_user_cpu_); (time_sample.tms_utime - last_user_cpu_);
percent /= (now - last_cpu_); percent /= (now - last_cpu_);
percent /= num_processors_;
percent *= 100; percent *= 100;
} }
last_cpu_ = now; last_cpu_ = now;
...@@ -207,6 +207,36 @@ SystemInfo::GPUMemoryTotal() { ...@@ -207,6 +207,36 @@ SystemInfo::GPUMemoryTotal() {
return result; return result;
} }
/**
 * Queries the temperature of every GPU through NVML.
 *
 * Runs Init() first if the object has not been initialized yet, then asks
 * NVML for the NVML_TEMPERATURE_GPU sensor of each device index in
 * [0, num_device_).
 *
 * @return one entry per device, in device-index order. The NVML return codes
 *         are not inspected; `device` and `temp` are value-initialized so
 *         that a failed query yields a defined value (0 unless NVML wrote
 *         something) instead of an indeterminate one.
 */
std::vector<unsigned int>
SystemInfo::GPUTemperature(){
    if(!initialized_) Init();

    std::vector<unsigned int> result;
    result.reserve(num_device_);

    // num_device_ is unsigned, so the index is unsigned too (no mixed-sign compare).
    for (unsigned int i = 0; i < num_device_; ++i) {
        nvmlDevice_t device{};
        nvmlDeviceGetHandleByIndex(i, &device);

        unsigned int temp = 0;
        nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temp);
        result.push_back(temp);
    }
    return result;
}
/**
 * Reads CPU temperatures from the Linux thermal sysfs interface.
 *
 * Probes /sys/class/thermal/thermal_zone<i>/temp for every i in
 * [0, num_physical_processors_] (inclusive) and stores each raw reading
 * divided by 1000 (the kernel thermal sysfs documentation specifies the raw
 * value in millidegrees Celsius).
 *
 * @return the readings in zone order. Reading stops at the first zone whose
 *         file cannot be opened (reported through perror) or whose content
 *         cannot be parsed as a number, so the result may hold fewer entries
 *         than zones probed; entry k always belongs to thermal_zone<k>.
 *
 * NOTE(review): unlike GPUTemperature(), this does not call Init() first, so
 * num_physical_processors_ may still be its default of 0 here — confirm that
 * callers initialize SystemInfo beforehand.
 */
std::vector<float>
SystemInfo::CPUTemperature(){
    std::vector<float> result;
    for (int i = 0; i <= num_physical_processors_; ++i) {
        const std::string path = "/sys/class/thermal/thermal_zone" + std::to_string(i) + "/temp";
        FILE *file = fopen(path.c_str(), "r");
        if (file == NULL) {
            perror("Could not open thermal file");
            return result;
        }

        float temp = 0;
        const int parsed = fscanf(file, "%f", &temp);
        // Close on every path; this runs once per zone on every metrics tick.
        fclose(file);

        if (parsed != 1) {
            // Do not publish a value that was never read.
            return result;
        }
        result.push_back(temp / 1000);
    }
    return result;
}
std::vector<unsigned long long> std::vector<unsigned long long>
SystemInfo::GPUMemoryUsed() { SystemInfo::GPUMemoryUsed() {
// get GPU memory used // get GPU memory used
......
...@@ -32,6 +32,7 @@ class SystemInfo { ...@@ -32,6 +32,7 @@ class SystemInfo {
clock_t last_user_cpu_ = clock_t(); clock_t last_user_cpu_ = clock_t();
std::chrono::system_clock::time_point net_time_ = std::chrono::system_clock::now(); std::chrono::system_clock::time_point net_time_ = std::chrono::system_clock::now();
int num_processors_ = 0; int num_processors_ = 0;
int num_physical_processors_ = 0;
//number of GPU //number of GPU
unsigned int num_device_ = 0; unsigned int num_device_ = 0;
unsigned long long in_octets_ = 0; unsigned long long in_octets_ = 0;
...@@ -47,6 +48,7 @@ class SystemInfo { ...@@ -47,6 +48,7 @@ class SystemInfo {
void Init(); void Init();
int num_processor() const { return num_processors_;}; int num_processor() const { return num_processors_;};
int num_physical_processors() const { return num_physical_processors_; };
int num_device() const {return num_device_;}; int num_device() const {return num_device_;};
unsigned long long get_inoctets() { return in_octets_;}; unsigned long long get_inoctets() { return in_octets_;};
unsigned long long get_octets() { return out_octets_;}; unsigned long long get_octets() { return out_octets_;};
...@@ -65,7 +67,8 @@ class SystemInfo { ...@@ -65,7 +67,8 @@ class SystemInfo {
std::vector<double> CPUCorePercent(); std::vector<double> CPUCorePercent();
std::vector<unsigned long long> getTotalCpuTime(std::vector<unsigned long long> &workTime); std::vector<unsigned long long> getTotalCpuTime(std::vector<unsigned long long> &workTime);
std::vector<unsigned int> GPUTemperature();
std::vector<float> CPUTemperature();
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册