PrometheusMetrics.cpp 7.4 KB
Newer Older
J
jinhai 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

Z
update  
Zhiru Zhu 已提交
18
#include "metrics/prometheus/PrometheusMetrics.h"
19
#include "cache/GpuCacheMgr.h"
20
#include "metrics/SystemInfo.h"
Z
format  
Zhiru Zhu 已提交
21
#include "server/Config.h"
G
groot 已提交
22
#include "utils/Log.h"
Y
yu yunfeng 已提交
23

S
starlord 已提交
24 25
#include <string>
#include <utility>
26

J
jinhai 已提交
27
namespace milvus {
Y
yu yunfeng 已提交
28 29
namespace server {

30
// Reads the metric settings from the server Config and, when monitoring is
// enabled, creates the Prometheus exposer and registers registry_ with it.
//
// Returns:
//   - SERVER_SUCCESS when monitoring is disabled or the exposer was set up;
//   - the code of the failing Status when a config lookup fails;
//   - SERVER_UNEXPECTED_ERROR when the setup throws a std::exception.
ErrorCode
PrometheusMetrics::Init() {
    try {
        Config& server_config = Config::GetInstance();

        // startup_ is filled in here and gates every other method of this class.
        Status enable_status = server_config.GetMetricConfigEnableMonitor(startup_);
        if (!enable_status.ok()) {
            return enable_status.code();
        }
        if (!startup_) {
            return SERVER_SUCCESS;
        }

        // The address handed to the exposer is whatever the config returns
        // for the prometheus port setting.
        std::string bind_address;
        Status port_status = server_config.GetMetricConfigPrometheusPort(bind_address);
        if (!port_status.ok()) {
            return port_status.code();
        }

        const std::string uri("/metrics");
        const std::size_t num_threads = 2;

        // Create the exposer, then attach the registry to it.
        exposer_ptr_ = std::make_shared<prometheus::Exposer>(bind_address, uri, num_threads);
        exposer_ptr_->RegisterCollectable(registry_);
    } catch (const std::exception& ex) {
        SERVER_LOG_ERROR << "Failed to connect prometheus server: " << std::string(ex.what());
        return SERVER_UNEXPECTED_ERROR;
    }

    return SERVER_SUCCESS;
}

void
S
starlord 已提交
66
PrometheusMetrics::CPUUsagePercentSet() {
S
starlord 已提交
67 68 69 70
    if (!startup_) {
        return;
    }

Y
yu yunfeng 已提交
71 72 73 74 75 76
    double usage_percent = server::SystemInfo::GetInstance().CPUPercent();
    CPU_usage_percent_.Set(usage_percent);
}

void
PrometheusMetrics::RAMUsagePercentSet() {
S
starlord 已提交
77 78 79 80
    if (!startup_) {
        return;
    }

Y
yu yunfeng 已提交
81 82 83 84 85 86
    double usage_percent = server::SystemInfo::GetInstance().MemoryPercent();
    RAM_usage_percent_.Set(usage_percent);
}

void
PrometheusMetrics::GPUPercentGaugeSet() {
S
starlord 已提交
87 88 89 90
    if (!startup_) {
        return;
    }

K
kun yu 已提交
91
    int numDevice = server::SystemInfo::GetInstance().num_device();
S
starlord 已提交
92 93
    std::vector<uint64_t> used_total = server::SystemInfo::GetInstance().GPUMemoryTotal();
    std::vector<uint64_t> used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed();
K
kun yu 已提交
94

K
kun yu 已提交
95
    for (int i = 0; i < numDevice; ++i) {
S
starlord 已提交
96 97
        prometheus::Gauge& GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}});
        double percent = (double)used_memory[i] / (double)used_total[i];
K
kun yu 已提交
98
        GPU_percent.Set(percent * 100);
K
kun yu 已提交
99
    }
Y
yu yunfeng 已提交
100 101
}

S
starlord 已提交
102 103
void
PrometheusMetrics::GPUMemoryUsageGaugeSet() {
S
starlord 已提交
104 105 106 107
    if (!startup_) {
        return;
    }

S
starlord 已提交
108 109
    std::vector<uint64_t> values = server::SystemInfo::GetInstance().GPUMemoryUsed();
    constexpr uint64_t MtoB = 1024 * 1024;
K
kun yu 已提交
110
    int numDevice = server::SystemInfo::GetInstance().num_device();
Y
yu yunfeng 已提交
111

K
kun yu 已提交
112
    for (int i = 0; i < numDevice; ++i) {
S
starlord 已提交
113
        prometheus::Gauge& GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}});
K
kun yu 已提交
114 115
        GPU_memory.Set(values[i] / MtoB);
    }
Y
yu yunfeng 已提交
116 117
}

S
starlord 已提交
118 119 120
void
PrometheusMetrics::AddVectorsPerSecondGaugeSet(int num_vector, int dim, double time) {
    // MB/s
S
starlord 已提交
121 122 123
    if (!startup_) {
        return;
    }
Y
yu yunfeng 已提交
124

S
starlord 已提交
125 126 127
    int64_t MtoB = 1024 * 1024;
    int64_t size = num_vector * dim * 4;
    add_vectors_per_second_gauge_.Set(size / time / MtoB);
Y
yu yunfeng 已提交
128
}
S
starlord 已提交
129 130 131

void
PrometheusMetrics::QueryIndexTypePerSecondSet(std::string type, double value) {
S
starlord 已提交
132 133 134 135
    if (!startup_) {
        return;
    }

S
starlord 已提交
136
    if (type == "IVF") {
Y
yu yunfeng 已提交
137
        query_index_IVF_type_per_second_gauge_.Set(value);
S
starlord 已提交
138
    } else if (type == "IDMap") {
Y
yu yunfeng 已提交
139 140 141
        query_index_IDMAP_type_per_second_gauge_.Set(value);
    }
}
Y
yu yunfeng 已提交
142

S
starlord 已提交
143 144
void
PrometheusMetrics::ConnectionGaugeIncrement() {
S
starlord 已提交
145 146 147 148
    if (!startup_) {
        return;
    }

Y
yu yunfeng 已提交
149 150
    connection_gauge_.Increment();
}
Y
yu yunfeng 已提交
151

S
starlord 已提交
152 153
void
PrometheusMetrics::ConnectionGaugeDecrement() {
S
starlord 已提交
154 155 156 157
    if (!startup_) {
        return;
    }

Y
yu yunfeng 已提交
158 159 160
    connection_gauge_.Decrement();
}

S
starlord 已提交
161 162
void
PrometheusMetrics::OctetsSet() {
S
starlord 已提交
163 164 165
    if (!startup_) {
        return;
    }
Y
yu yunfeng 已提交
166 167

    // get old stats and reset them
S
starlord 已提交
168 169
    uint64_t old_inoctets = SystemInfo::GetInstance().get_inoctets();
    uint64_t old_outoctets = SystemInfo::GetInstance().get_octets();
Y
yu yunfeng 已提交
170
    auto old_time = SystemInfo::GetInstance().get_nettime();
S
starlord 已提交
171
    std::pair<uint64_t, uint64_t> in_and_out_octets = SystemInfo::GetInstance().Octets();
Y
yu yunfeng 已提交
172 173 174 175 176
    SystemInfo::GetInstance().set_inoctets(in_and_out_octets.first);
    SystemInfo::GetInstance().set_outoctets(in_and_out_octets.second);
    SystemInfo::GetInstance().set_nettime();

    //
Y
yu yunfeng 已提交
177
    constexpr double micro_to_second = 1e-6;
Y
yu yunfeng 已提交
178 179
    auto now_time = std::chrono::system_clock::now();
    auto total_microsecond = METRICS_MICROSECONDS(old_time, now_time);
S
starlord 已提交
180
    auto total_second = total_microsecond * micro_to_second;
S
starlord 已提交
181 182 183 184
    if (total_second == 0) {
        return;
    }

S
starlord 已提交
185 186
    inoctets_gauge_.Set((in_and_out_octets.first - old_inoctets) / total_second);
    outoctets_gauge_.Set((in_and_out_octets.second - old_outoctets) / total_second);
Y
yu yunfeng 已提交
187 188
}

S
starlord 已提交
189 190
void
PrometheusMetrics::CPUCoreUsagePercentSet() {
S
starlord 已提交
191 192 193
    if (!startup_) {
        return;
    }
K
kun yu 已提交
194 195 196

    std::vector<double> cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent();

K
kun yu 已提交
197
    for (int i = 0; i < cpu_core_percent.size(); ++i) {
S
starlord 已提交
198
        prometheus::Gauge& core_percent = CPU_.Add({{"CPU", std::to_string(i)}});
K
kun yu 已提交
199 200 201
        core_percent.Set(cpu_core_percent[i]);
    }
}
Y
fix  
yu yunfeng 已提交
202

S
starlord 已提交
203 204
void
PrometheusMetrics::GPUTemperature() {
S
starlord 已提交
205 206 207
    if (!startup_) {
        return;
    }
K
kun yu 已提交
208

S
starlord 已提交
209
    std::vector<uint64_t> GPU_temperatures = server::SystemInfo::GetInstance().GPUTemperature();
K
kun yu 已提交
210 211

    for (int i = 0; i < GPU_temperatures.size(); ++i) {
S
starlord 已提交
212
        prometheus::Gauge& gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}});
K
kun yu 已提交
213 214 215 216
        gpu_temp.Set(GPU_temperatures[i]);
    }
}

S
starlord 已提交
217 218
void
PrometheusMetrics::CPUTemperature() {
S
starlord 已提交
219 220 221
    if (!startup_) {
        return;
    }
K
kun yu 已提交
222 223 224

    std::vector<float> CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature();

225
    float avg_cpu_temp = 0;
K
kun yu 已提交
226
    for (int i = 0; i < CPU_temperatures.size(); ++i) {
227
        avg_cpu_temp += CPU_temperatures[i];
K
kun yu 已提交
228
    }
229 230 231 232 233
    avg_cpu_temp /= CPU_temperatures.size();

    prometheus::Gauge& cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(0)}});
    cpu_temp.Set(avg_cpu_temp);

234 235 236 237
    //    for (int i = 0; i < CPU_temperatures.size(); ++i) {
    //        prometheus::Gauge& cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}});
    //        cpu_temp.Set(CPU_temperatures[i]);
    //    }
K
kun yu 已提交
238
}
Y
yu yunfeng 已提交
239

S
starlord 已提交
240 241
// Currently a no-op: the only implementation is commented out below.
void
PrometheusMetrics::GpuCacheUsageGaugeSet() {
    // Disabled implementation, kept for reference:
    //    std::vector<uint64_t > gpu_ids = {0};
    //    for(auto i = 0; i < gpu_ids.size(); ++i) {
    //        uint64_t cache_usage = cache::GpuCacheMgr::GetInstance(gpu_ids[i])->CacheUsage();
    //        uint64_t cache_capacity = cache::GpuCacheMgr::GetInstance(gpu_ids[i])->CacheCapacity();
    //        prometheus::Gauge &gpu_cache = gpu_cache_usage_.Add({{"GPU_Cache", std::to_string(i)}});
    //        gpu_cache.Set(cache_usage * 100 / cache_capacity);
    //    }
}

S
starlord 已提交
251 252
}  // namespace server
}  // namespace milvus