PrometheusMetrics.cpp 6.8 KB
Newer Older
J
jinhai 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

Y
yu yunfeng 已提交
18

19
#include "cache/GpuCacheMgr.h"
Y
yu yunfeng 已提交
20
#include "PrometheusMetrics.h"
21
#include "server/Config.h"
G
groot 已提交
22
#include "utils/Log.h"
Y
yu yunfeng 已提交
23
#include "SystemInfo.h"
Y
yu yunfeng 已提交
24

25

Y
yu yunfeng 已提交
26
namespace zilliz {
J
jinhai 已提交
27
namespace milvus {
Y
yu yunfeng 已提交
28 29
namespace server {

30
// Sets up the Prometheus exposer from the server configuration.
//
// startup_ is passed to Config::GetMetricConfigAutoBootup (presumably filled
// as an out-parameter - confirm against Config). When startup_ is false
// afterwards, no exposer is created and SERVER_SUCCESS is returned.
// Otherwise the Prometheus port setting is read, an exposer is created with
// it, and registry_ is registered on that exposer.
//
// Returns:
//   SERVER_SUCCESS           - metrics disabled, or exposer created.
//   s.code()                 - a config lookup reported a non-ok Status.
//   SERVER_UNEXPECTED_ERROR  - a std::exception was thrown during setup.
ErrorCode
PrometheusMetrics::Init() {
    try {
        Config &config = Config::GetInstance();

        Status s = config.GetMetricConfigAutoBootup(startup_);
        if (!s.ok()) {
            return s.code();
        }
        if (!startup_) {
            return SERVER_SUCCESS;
        }

        // The string handed to the exposer as its bind address is whatever the
        // Prometheus port setting yields.
        std::string bind_address;
        s = config.GetMetricConfigPrometheusPort(bind_address);
        if (!s.ok()) {
            return s.code();
        }

        const std::string uri = "/tmp/metrics";
        const std::size_t num_threads = 2;

        // Create the exposer, then attach the registry to it.
        exposer_ptr_ = std::make_shared<prometheus::Exposer>(bind_address, uri, num_threads);
        exposer_ptr_->RegisterCollectable(registry_);
    } catch (const std::exception &ex) {
        // Caught by const reference: the handler only reads the message.
        SERVER_LOG_ERROR << "Failed to connect prometheus server: " << ex.what();
        return SERVER_UNEXPECTED_ERROR;
    }

    return SERVER_SUCCESS;
}

Y
fix  
yu yunfeng 已提交
59

Y
yu yunfeng 已提交
60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
void
PrometheusMetrics::CPUUsagePercentSet()  {
    if(!startup_) return ;
    double usage_percent = server::SystemInfo::GetInstance().CPUPercent();
    CPU_usage_percent_.Set(usage_percent);
}

void
PrometheusMetrics::RAMUsagePercentSet() {
    if(!startup_) return ;
    double usage_percent = server::SystemInfo::GetInstance().MemoryPercent();
    RAM_usage_percent_.Set(usage_percent);
}

void
PrometheusMetrics::GPUPercentGaugeSet() {
    if(!startup_) return;
K
kun yu 已提交
77
    int numDevice = server::SystemInfo::GetInstance().num_device();
K
kun yu 已提交
78
    std::vector<unsigned long long > used_total = server::SystemInfo::GetInstance().GPUMemoryTotal();
K
kun yu 已提交
79 80
    std::vector<unsigned long long > used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed();

K
kun yu 已提交
81
    for (int i = 0; i < numDevice; ++i) {
K
kun yu 已提交
82
        prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}});
K
kun yu 已提交
83 84
        double percent = (double)used_memory[i] / (double)used_total[i];
        GPU_percent.Set(percent * 100);
K
kun yu 已提交
85
    }
Y
yu yunfeng 已提交
86 87 88 89 90
}

void PrometheusMetrics::GPUMemoryUsageGaugeSet() {
    if(!startup_) return;
    std::vector<unsigned long long> values = server::SystemInfo::GetInstance().GPUMemoryUsed();
Y
fix  
yu yunfeng 已提交
91
    constexpr unsigned long long MtoB = 1024*1024;
K
kun yu 已提交
92
    int numDevice = server::SystemInfo::GetInstance().num_device();
Y
yu yunfeng 已提交
93

K
kun yu 已提交
94
    for (int i = 0; i < numDevice; ++i) {
K
kun yu 已提交
95 96 97 98
        prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}});
        GPU_memory.Set(values[i] / MtoB);
    }

Y
yu yunfeng 已提交
99 100 101 102 103 104 105 106 107
}
// Sets add_vectors_per_second_gauge_ to (num_vector * dim * 4) / time / (1024*1024).
//
// num_vector - number of vectors added.
// dim        - number of elements per vector.
// time       - elapsed time used as the divisor (presumably seconds, giving
//              MB/s as the original comment states - confirm with callers).
//
// Does nothing while startup_ is false, or when time is not strictly positive
// (zero would make the rate infinite; NaN is rejected by the same test).
void PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) {
    if (!startup_) {
        return;
    }
    if (!(time > 0.0)) {
        return;
    }

    constexpr long long MtoB = 1024 * 1024;
    // 4 per element - presumably sizeof(float); confirm against the vector format.
    constexpr long long bytes_per_element = 4;

    // Widen before multiplying so the product is computed in long long rather
    // than overflowing int.
    const long long size = static_cast<long long>(num_vector) * dim * bytes_per_element;
    add_vectors_per_second_gauge_.Set(size / time / MtoB);
}
Y
yu yunfeng 已提交
109
void PrometheusMetrics::QueryIndexTypePerSecondSet(std::string type, double value) {
Y
yu yunfeng 已提交
110
    if(!startup_) return;
Y
yu yunfeng 已提交
111 112 113 114 115 116 117
    if(type == "IVF"){
        query_index_IVF_type_per_second_gauge_.Set(value);
    } else if(type == "IDMap"){
        query_index_IDMAP_type_per_second_gauge_.Set(value);
    }

}
Y
yu yunfeng 已提交
118

Y
yu yunfeng 已提交
119 120 121 122
// Increments connection_gauge_ by one step; no-op while startup_ is false.
void PrometheusMetrics::ConnectionGaugeIncrement() {
    if (startup_) {
        connection_gauge_.Increment();
    }
}
Y
yu yunfeng 已提交
123

Y
yu yunfeng 已提交
124 125 126 127 128
// Decrements connection_gauge_ by one step; no-op while startup_ is false.
void PrometheusMetrics::ConnectionGaugeDecrement() {
    if (startup_) {
        connection_gauge_.Decrement();
    }
}

Y
yu yunfeng 已提交
129 130 131 132
// Updates inoctets_gauge_ and outoctets_gauge_ with the change in the octet
// counters divided by the elapsed time since the previously stored sample.
//
// Steps, in order:
//   1. read the previously stored in/out counters and timestamp from SystemInfo;
//   2. take a fresh sample via SystemInfo::Octets() and store it (counters and
//      timestamp) back into SystemInfo;
//   3. compute the elapsed time up to "now" and, unless it is zero, set both
//      gauges to (new - old) / elapsed.
// Does nothing while startup_ is false.
void PrometheusMetrics::OctetsSet() {
    if (!startup_) {
        return;
    }

    auto &sys_info = SystemInfo::GetInstance();

    // Previous sample.
    unsigned long long prev_in = sys_info.get_inoctets();
    unsigned long long prev_out = sys_info.get_octets();
    auto prev_time = sys_info.get_nettime();

    // Fresh sample; persist it for the next call.
    std::pair<unsigned long long, unsigned long long> sample = sys_info.Octets();
    sys_info.set_inoctets(sample.first);
    sys_info.set_outoctets(sample.second);
    sys_info.set_nettime();

    // Elapsed time between the previous sample and now, scaled by 1e-6.
    constexpr double micro_to_second = 1e-6;
    auto current_time = std::chrono::system_clock::now();
    auto elapsed_microseconds = METRICS_MICROSECONDS(prev_time, current_time);
    auto elapsed_seconds = elapsed_microseconds * micro_to_second;
    if (elapsed_seconds == 0) {
        return;
    }

    auto in_rate = (sample.first - prev_in) / elapsed_seconds;
    auto out_rate = (sample.second - prev_out) / elapsed_seconds;
    inoctets_gauge_.Set(in_rate);
    outoctets_gauge_.Set(out_rate);
}

K
kun yu 已提交
151 152 153 154 155 156
void PrometheusMetrics::CPUCoreUsagePercentSet() {
    if (!startup_)
        return;

    std::vector<double> cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent();

K
kun yu 已提交
157
    for (int i = 0; i < cpu_core_percent.size(); ++i) {
K
kun yu 已提交
158 159 160 161
        prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}});
        core_percent.Set(cpu_core_percent[i]);
    }
}
Y
fix  
yu yunfeng 已提交
162

K
kun yu 已提交
163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185
void PrometheusMetrics::GPUTemperature() {
    if (!startup_)
        return;

    std::vector<unsigned int> GPU_temperatures = server::SystemInfo::GetInstance().GPUTemperature();

    for (int i = 0; i < GPU_temperatures.size(); ++i) {
        prometheus::Gauge &gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}});
        gpu_temp.Set(GPU_temperatures[i]);
    }
}

void PrometheusMetrics::CPUTemperature() {
    if (!startup_)
        return;

    std::vector<float> CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature();

    for (int i = 0; i < CPU_temperatures.size(); ++i) {
        prometheus::Gauge &cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}});
        cpu_temp.Set(CPU_temperatures[i]);
    }
}
Y
yu yunfeng 已提交
186

Y
Yu Kun 已提交
187
// Currently a no-op: the whole implementation is commented out, so calling
// this function changes no gauge. The disabled code is retained below.
void PrometheusMetrics::GpuCacheUsageGaugeSet() {
//    std::vector<uint64_t > gpu_ids = {0};
//    for(auto i = 0; i < gpu_ids.size(); ++i) {
//        uint64_t cache_usage = cache::GpuCacheMgr::GetInstance(gpu_ids[i])->CacheUsage();
//        uint64_t cache_capacity = cache::GpuCacheMgr::GetInstance(gpu_ids[i])->CacheCapacity();
//        prometheus::Gauge &gpu_cache = gpu_cache_usage_.Add({{"GPU_Cache", std::to_string(i)}});
//        gpu_cache.Set(cache_usage * 100 / cache_capacity);
//    }
}

Y
yu yunfeng 已提交
197 198 199
}
}
}