// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "metrics/prometheus/PrometheusMetrics.h"
#include "cache/GpuCacheMgr.h"
#include "metrics/SystemInfo.h"
#include "server/Config.h"
#include "utils/Log.h"

#include <string>
#include <utility>

namespace milvus {
namespace server {

ErrorCode
PrometheusMetrics::Init() {
    try {
        Config& config = Config::GetInstance();
        Status s = config.GetMetricConfigEnableMonitor(startup_);
        if (!s.ok()) {
            return s.code();
        }
        if (!startup_) {
            return SERVER_SUCCESS;
        }

        // Following should be read from config file.
        std::string bind_address;
        s = config.GetMetricConfigPrometheusPort(bind_address);
        if (!s.ok()) {
            return s.code();
        }
        const std::string uri = std::string("/metrics");
        const std::size_t num_threads = 2;

        // Init Exposer
        exposer_ptr_ = std::make_shared<prometheus::Exposer>(bind_address, uri, num_threads);

        // Exposer Registry
        exposer_ptr_->RegisterCollectable(registry_);
    } catch (std::exception& ex) {
        SERVER_LOG_ERROR << "Failed to connect prometheus server: " << std::string(ex.what());
        return SERVER_UNEXPECTED_ERROR;
    }

    return SERVER_SUCCESS;
}

void
PrometheusMetrics::CPUUsagePercentSet() {
    if (!startup_) {
        return;
    }

    double usage_percent = server::SystemInfo::GetInstance().CPUPercent();
    CPU_usage_percent_.Set(usage_percent);
}

void
PrometheusMetrics::RAMUsagePercentSet() {
    if (!startup_) {
        return;
    }

    double usage_percent = server::SystemInfo::GetInstance().MemoryPercent();
    RAM_usage_percent_.Set(usage_percent);
}

void
PrometheusMetrics::GPUPercentGaugeSet() {
    if (!startup_) {
        return;
    }

    int numDevice = server::SystemInfo::GetInstance().num_device();
    std::vector<uint64_t> used_total = server::SystemInfo::GetInstance().GPUMemoryTotal();
    std::vector<uint64_t> used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed();

    // report per-device GPU memory usage as a percentage of total memory
    for (int i = 0; i < numDevice; ++i) {
        prometheus::Gauge& GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}});
        double percent = (double)used_memory[i] / (double)used_total[i];
        GPU_percent.Set(percent * 100);
    }
}

void
PrometheusMetrics::GPUMemoryUsageGaugeSet() {
    if (!startup_) {
        return;
    }

    std::vector<uint64_t> values = server::SystemInfo::GetInstance().GPUMemoryUsed();
    constexpr uint64_t MtoB = 1024 * 1024;
    int numDevice = server::SystemInfo::GetInstance().num_device();

    // report per-device GPU memory usage in MB
    for (int i = 0; i < numDevice; ++i) {
        prometheus::Gauge& GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}});
        GPU_memory.Set(values[i] / MtoB);
    }
}

void
PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) {
    // MB/s
    if (!startup_) {
        return;
    }

    int64_t MtoB = 1024 * 1024;
    int64_t size = num_vector * dim * 4;  // 4 bytes per float dimension
    add_vectors_per_second_gauge_.Set(size / time / MtoB);
}

void
PrometheusMetrics::QueryIndexTypePerSecondSet(std::string type, double value) {
    if (!startup_) {
        return;
    }

    if (type == "IVF") {
        query_index_IVF_type_per_second_gauge_.Set(value);
    } else if (type == "IDMap") {
        query_index_IDMAP_type_per_second_gauge_.Set(value);
    }
}

void
PrometheusMetrics::ConnectionGaugeIncrement() {
    if (!startup_) {
        return;
    }

    connection_gauge_.Increment();
}

void
PrometheusMetrics::ConnectionGaugeDecrement() {
    if (!startup_) {
        return;
    }

    connection_gauge_.Decrement();
}

void
PrometheusMetrics::OctetsSet() {
    if (!startup_) {
        return;
    }

    // get old stats and reset them
    uint64_t old_inoctets = SystemInfo::GetInstance().get_inoctets();
    uint64_t old_outoctets = SystemInfo::GetInstance().get_octets();
    auto old_time = SystemInfo::GetInstance().get_nettime();

    std::pair<uint64_t, uint64_t> in_and_out_octets = SystemInfo::GetInstance().Octets();
    SystemInfo::GetInstance().set_inoctets(in_and_out_octets.first);
    SystemInfo::GetInstance().set_outoctets(in_and_out_octets.second);
    SystemInfo::GetInstance().set_nettime();

    // compute bytes per second since the previous sample
    constexpr double micro_to_second = 1e-6;
    auto now_time = std::chrono::system_clock::now();
    auto total_microsecond = METRICS_MICROSECONDS(old_time, now_time);
    auto total_second = total_microsecond * micro_to_second;
    if (total_second == 0) {
        return;
    }

    inoctets_gauge_.Set((in_and_out_octets.first - old_inoctets) / total_second);
    outoctets_gauge_.Set((in_and_out_octets.second - old_outoctets) / total_second);
}

void
PrometheusMetrics::CPUCoreUsagePercentSet() {
    if (!startup_) {
        return;
    }

    std::vector<double> cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent();

    for (size_t i = 0; i < cpu_core_percent.size(); ++i) {
        prometheus::Gauge& core_percent = CPU_.Add({{"CPU", std::to_string(i)}});
        core_percent.Set(cpu_core_percent[i]);
    }
}

void
PrometheusMetrics::GPUTemperature() {
    if (!startup_) {
        return;
    }

    std::vector<uint64_t> GPU_temperatures = server::SystemInfo::GetInstance().GPUTemperature();

    for (size_t i = 0; i < GPU_temperatures.size(); ++i) {
        prometheus::Gauge& gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}});
        gpu_temp.Set(GPU_temperatures[i]);
    }
}

void
PrometheusMetrics::CPUTemperature() {
    if (!startup_) {
        return;
    }

    std::vector<float> CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature();

    // report the average temperature across all CPU sensors under a single label
    float avg_cpu_temp = 0;
    for (size_t i = 0; i < CPU_temperatures.size(); ++i) {
        avg_cpu_temp += CPU_temperatures[i];
    }
    avg_cpu_temp /= CPU_temperatures.size();

    prometheus::Gauge& cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(0)}});
    cpu_temp.Set(avg_cpu_temp);

    //    for (int i = 0; i < CPU_temperatures.size(); ++i) {
    //        prometheus::Gauge& cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}});
    //        cpu_temp.Set(CPU_temperatures[i]);
    //    }
}

void
PrometheusMetrics::GpuCacheUsageGaugeSet() {
    //    std::vector<uint64_t> gpu_ids = {0};
    //    for (auto i = 0; i < gpu_ids.size(); ++i) {
    //        uint64_t cache_usage = cache::GpuCacheMgr::GetInstance(gpu_ids[i])->CacheUsage();
    //        uint64_t cache_capacity = cache::GpuCacheMgr::GetInstance(gpu_ids[i])->CacheCapacity();
    //        prometheus::Gauge& gpu_cache = gpu_cache_usage_.Add({{"GPU_Cache", std::to_string(i)}});
    //        gpu_cache.Set(cache_usage * 100 / cache_capacity);
    //    }
}

}  // namespace server
}  // namespace milvus