提交 11336a2f 编写于 作者: Z zhiru

Merge remote-tracking branch 'upstream/branch-0.4.0' into branch-0.4.0


Former-commit-id: d328c6d9914a9cdab818f1a8b604b17867f1b9f5
此差异已折叠。
......@@ -325,6 +325,9 @@ void DBImpl::StartMetricTask() {
server::Metrics::GetInstance().GPUMemoryUsageGaugeSet();
server::Metrics::GetInstance().OctetsSet();
server::Metrics::GetInstance().CPUCoreUsagePercentSet();
ENGINE_LOG_TRACE << "Metric task finished";
}
......@@ -562,15 +565,25 @@ Status DBImpl::BuildIndex(const meta::TableFileSchema& file) {
auto to_remove = file;
to_remove.file_type_ = meta::TableFileSchema::TO_DELETE;
meta::TableFilesSchema update_files = {to_remove, table_file};
meta_ptr_->UpdateTableFiles(update_files);
meta::TableFilesSchema update_files = {table_file, to_remove};
status = meta_ptr_->UpdateTableFiles(update_files);
if(status.ok()) {
ENGINE_LOG_DEBUG << "New index file " << table_file.file_id_ << " of size "
<< index->PhysicalSize() << " bytes"
<< " from file " << to_remove.file_id_;
ENGINE_LOG_DEBUG << "New index file " << table_file.file_id_ << " of size "
<< index->PhysicalSize() << " bytes"
<< " from file " << to_remove.file_id_;
if(options_.insert_cache_immediately_) {
index->Cache();
if(options_.insert_cache_immediately_) {
index->Cache();
}
} else {
//failed to update meta, mark the new file as to_delete, don't delete old file
to_remove.file_type_ = meta::TableFileSchema::TO_INDEX;
status = meta_ptr_->UpdateTableFile(to_remove);
ENGINE_LOG_DEBUG << "Failed to update file to index, mark file: " << to_remove.file_id_ << " to to_index";
table_file.file_type_ = meta::TableFileSchema::TO_DELETE;
status = meta_ptr_->UpdateTableFile(table_file);
ENGINE_LOG_DEBUG << "Failed to update file to index, mark file: " << table_file.file_id_ << " to to_delete";
}
} catch (std::exception& ex) {
......
......@@ -64,6 +64,8 @@ class MetricsBase{
virtual void ConnectionGaugeDecrement() {};
virtual void KeepingAliveCounterIncrement(double value = 1) {};
virtual void OctetsSet() {};
virtual void CPUCoreUsagePercentSet() {};
};
......
......@@ -44,6 +44,8 @@ PrometheusMetrics::Init() {
// Publishes the value returned by SystemInfo::CPUPercent() to the
// CPU_usage_percent_ gauge. Does nothing while startup_ is false.
void
PrometheusMetrics::CPUUsagePercentSet() {
    if (!startup_) return;

    // The processor count was previously fetched here but never used.
    const double usage_percent = server::SystemInfo::GetInstance().CPUPercent();
    CPU_usage_percent_.Set(usage_percent);
}
......@@ -58,36 +60,30 @@ PrometheusMetrics::RAMUsagePercentSet() {
// Updates one gauge per GPU in the GPU_percent_ family, labelled
// {"DeviceNum", "<index>"}. The published value is used memory divided by
// total memory, times 100, both taken from SystemInfo.
// Does nothing while startup_ is false.
void
PrometheusMetrics::GPUPercentGaugeSet() {
    if (!startup_) return;

    typedef std::vector<unsigned long long>::size_type size_type;

    // Query the memory vectors first: GPUMemoryTotal() triggers the lazy
    // SystemInfo initialisation, while num_device() only returns the stored count.
    const std::vector<unsigned long long> total_memory = server::SystemInfo::GetInstance().GPUMemoryTotal();
    const std::vector<unsigned long long> used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed();
    const int numDevice = server::SystemInfo::GetInstance().num_device();

    // Never index past what was actually reported for either vector.
    size_type device_count = numDevice > 0 ? static_cast<size_type>(numDevice) : 0;
    if (total_memory.size() < device_count) device_count = total_memory.size();
    if (used_memory.size() < device_count) device_count = used_memory.size();

    for (size_type i = 0; i < device_count; ++i) {
        // A zero total would yield inf/NaN; skip the device instead of publishing it.
        if (total_memory[i] == 0) continue;

        prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}});
        const double percent = static_cast<double>(used_memory[i]) / static_cast<double>(total_memory[i]);
        GPU_percent.Set(percent * 100);
    }
}
// Updates one gauge per GPU in the GPU_memory_usage_ family, labelled
// {"DeviceNum", "<index>"}. The value is SystemInfo::GPUMemoryUsed() for that
// device divided by 1024*1024 (integer division, so fractions are truncated).
// Does nothing while startup_ is false.
void PrometheusMetrics::GPUMemoryUsageGaugeSet() {
    if (!startup_) return;

    typedef std::vector<unsigned long long>::size_type size_type;

    const std::vector<unsigned long long> values = server::SystemInfo::GetInstance().GPUMemoryUsed();
    constexpr unsigned long long MtoB = 1024*1024;
    const int numDevice = server::SystemInfo::GetInstance().num_device();

    // Bound the loop by both the device count and the number of readings returned.
    size_type device_count = numDevice > 0 ? static_cast<size_type>(numDevice) : 0;
    if (values.size() < device_count) device_count = values.size();

    for (size_type i = 0; i < device_count; ++i) {
        prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}});
        GPU_memory.Set(static_cast<double>(values[i] / MtoB));
    }
}
void PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) {
// MB/s
......@@ -140,6 +136,17 @@ void PrometheusMetrics::OctetsSet() {
outoctets_gauge_.Set((in_and_out_octets.second-old_outoctets)/total_second);
}
// Publishes every value returned by SystemInfo::CPUCorePercent() to a gauge in
// the CPU_ family, labelled {"CPU", "<position in the returned vector>"}.
// Does nothing while startup_ is false.
//
// NOTE(review): CPU_usage_percent_ is registered in the same CPU_ family with
// the label {"CPU", "0"}, which is the label set used for the first entry here.
// Confirm the two are really meant to share one series.
void PrometheusMetrics::CPUCoreUsagePercentSet() {
    if (!startup_)
        return;

    const std::vector<double> cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent();

    // Range-for with an explicit counter avoids comparing a signed index
    // against the unsigned vector size.
    int core_index = 0;
    for (const double percent : cpu_core_percent) {
        prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(core_index)}});
        core_percent.Set(percent);
        ++core_index;
    }
}
}
......
......@@ -12,6 +12,7 @@
#include <prometheus/registry.h>
#include <prometheus/exposer.h>
#include <iostream>
#include "server/ServerConfig.h"
#include "MetricBase.h"
......@@ -78,6 +79,9 @@ class PrometheusMetrics: public MetricsBase {
void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);};
void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);};
void CPUUsagePercentSet() override ;
void CPUCoreUsagePercentSet() override;
void RAMUsagePercentSet() override ;
void QueryResponsePerSecondGaugeSet(double value) override {if(startup_) query_response_per_second_gauge.Set(value);};
void GPUPercentGaugeSet() override ;
......@@ -322,7 +326,7 @@ class PrometheusMetrics: public MetricsBase {
prometheus::Gauge &faiss_disk_load_IO_speed_gauge_ = faiss_disk_load_IO_speed_.Add({{"DB","Faiss"}});
////all from CacheMgr.cpp
////all from CacheMgr.cpp
//record cache access count
prometheus::Family<prometheus::Counter> &cache_access_ = prometheus::BuildCounter()
.Name("cache_access_total")
......@@ -392,7 +396,8 @@ class PrometheusMetrics: public MetricsBase {
.Name("CPU_usage_percent")
.Help("CPU usage percent by this this process")
.Register(*registry_);
prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({});
prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}});
prometheus::Family<prometheus::Gauge> &RAM_ = prometheus::BuildGauge()
.Name("RAM_usage_percent")
......@@ -405,33 +410,12 @@ class PrometheusMetrics: public MetricsBase {
.Name("Gpu_usage_percent")
.Help("GPU_usage_percent ")
.Register(*registry_);
prometheus::Gauge &GPU0_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "0"}});
prometheus::Gauge &GPU1_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "1"}});
prometheus::Gauge &GPU2_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "2"}});
prometheus::Gauge &GPU3_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "3"}});
prometheus::Gauge &GPU4_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "4"}});
prometheus::Gauge &GPU5_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "5"}});
prometheus::Gauge &GPU6_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "6"}});
prometheus::Gauge &GPU7_percent_gauge_ = GPU_percent_.Add({{"DeviceNum", "7"}});
// std::vector<prometheus::Gauge> GPU_percent_gauges_;
//GPU Memory used
prometheus::Family<prometheus::Gauge> &GPU_memory_usage_ = prometheus::BuildGauge()
.Name("GPU_memory_usage_total")
.Help("GPU memory usage total ")
.Register(*registry_);
prometheus::Gauge &GPU0_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "0"}});
prometheus::Gauge &GPU1_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "1"}});
prometheus::Gauge &GPU2_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "2"}});
prometheus::Gauge &GPU3_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "3"}});
prometheus::Gauge &GPU4_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "4"}});
prometheus::Gauge &GPU5_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "5"}});
prometheus::Gauge &GPU6_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "6"}});
prometheus::Gauge &GPU7_memory_usage_gauge_ = GPU_memory_usage_.Add({{"DeviceNum", "7"}});
// std::vector<prometheus::Gauge> GPU_memory_usage_gauges_;
prometheus::Family<prometheus::Gauge> &query_index_type_per_second_ = prometheus::BuildGauge()
.Name("query_index_throughtout_per_microsecond")
......
......@@ -105,9 +105,65 @@ SystemInfo::GetProcessUsedMemory() {
// Returns GetProcessUsedMemory() as a percentage of total_ram_.
// Initialises SystemInfo lazily on first use.
double
SystemInfo::MemoryPercent() {
    if (!initialized_) Init();

    // Convert to double before scaling so the division is not truncated and
    // the multiplication by 100 cannot overflow an integer type.
    return static_cast<double>(GetProcessUsedMemory()) * 100.0 / static_cast<double>(total_ram_);
}
std::vector<double>
SystemInfo::CPUCorePercent() {
std::vector<unsigned long long> prev_work_time_array;
std::vector<unsigned long long> prev_total_time_array = getTotalCpuTime(prev_work_time_array);
usleep(100000);
std::vector<unsigned long long> cur_work_time_array;
std::vector<unsigned long long> cur_total_time_array = getTotalCpuTime(cur_work_time_array);
std::vector<double> cpu_core_percent;
for (int i = 0; i < num_processors_; i++) {
double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i];
double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i];
cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100);
}
return cpu_core_percent;
}
// Reads the per-core CPU time counters from /proc/stat.
//
// For each per-core line ("cpuN ..."), up to num_processors_ of them:
//   - appends user + nice + system to work_time_array
//   - appends user + nice + system + idle + iowait + irq + softirq + steal
//     to the returned vector
// Entry i of both vectors therefore belongs to the i-th per-core line.
//
// On failure to open or read the file an error is printed with perror() and
// whatever was collected so far (possibly nothing) is returned.
std::vector<unsigned long long>
SystemInfo::getTotalCpuTime(std::vector<unsigned long long> &work_time_array)
{
    std::vector<unsigned long long> total_time_array;
    FILE* file = fopen("/proc/stat", "r");
    if (file == nullptr) {
        perror("Could not open stat file");
        return total_time_array;
    }

    char buffer[1024];
    while (static_cast<int>(total_time_array.size()) < num_processors_) {
        if (fgets(buffer, sizeof(buffer), file) == nullptr) {
            perror("Could not read stat file");
            break;
        }

        // Stop at the first line that is not a "cpu" line.
        // NOTE(review): assumes the cpu lines form one contiguous group at the
        // top of /proc/stat, as described in proc(5) - confirm for target kernels.
        const bool is_cpu_line = buffer[0] == 'c' && buffer[1] == 'p' && buffer[2] == 'u';
        if (!is_cpu_line) {
            break;
        }

        // The aggregate line is "cpu  ..." (no index); only "cpuN" lines are per-core.
        if (buffer[3] < '0' || buffer[3] > '9') {
            continue;
        }

        // Reset per line so that a short line leaves missing fields at 0
        // instead of carrying over values from the previous core.
        unsigned long long user = 0, nice = 0, system = 0, idle = 0;
        unsigned long long iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guestnice = 0;

        // "%*u" consumes the core index so it is not mistaken for the user field.
        sscanf(buffer,
               "cpu%*u %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
               &user, &nice, &system, &idle, &iowait, &irq, &softirq, &steal, &guest, &guestnice);

        work_time_array.push_back(user + nice + system);
        total_time_array.push_back(user + nice + system + idle + iowait + irq + softirq + steal);
    }

    fclose(file);
    return total_time_array;
}
double
SystemInfo::CPUPercent() {
if (!initialized_) Init();
......@@ -136,17 +192,17 @@ SystemInfo::CPUPercent() {
}
std::vector<unsigned int>
SystemInfo::GPUPercent() {
std::vector<unsigned long long>
SystemInfo::GPUMemoryTotal() {
// get GPU usage percent
if(!initialized_) Init();
std::vector<unsigned int> result;
nvmlUtilization_t utilization;
std::vector<unsigned long long > result;
nvmlMemory_t nvmlMemory;
for (int i = 0; i < num_device_; ++i) {
nvmlDevice_t device;
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetUtilizationRates(device, &utilization);
result.push_back(utilization.gpu);
nvmlDeviceGetMemoryInfo(device, &nvmlMemory);
result.push_back(nvmlMemory.total);
}
return result;
}
......
......@@ -46,6 +46,7 @@ class SystemInfo {
}
void Init();
int num_processor() const { return num_processors_;};
int num_device() const {return num_device_;};
unsigned long long get_inoctets() { return in_octets_;};
unsigned long long get_octets() { return out_octets_;};
......@@ -59,9 +60,13 @@ class SystemInfo {
double MemoryPercent();
double CPUPercent();
std::pair<unsigned long long , unsigned long long > Octets();
std::vector<unsigned int> GPUPercent();
std::vector<unsigned long long> GPUMemoryTotal();
std::vector<unsigned long long> GPUMemoryUsed();
std::vector<double> CPUCorePercent();
std::vector<unsigned long long> getTotalCpuTime(std::vector<unsigned long long> &workTime);
};
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册