// Copyright (C) 2019-2020 Zilliz. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software distributed under the License
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
// or implied. See the License for the specific language governing permissions and limitations under the License.

S
starlord 已提交
12
#include "metrics/SystemInfo.h"
13
#include "thirdparty/nlohmann/json.hpp"
G
groot 已提交
14
#include "utils/Exception.h"
H
Heisenberg 已提交
15
#include "utils/Log.h"
Y
yu yunfeng 已提交
16

S
starlord 已提交
17
#include <dirent.h>
18
#include <fiu/fiu-local.h>
19 20
#include <sys/sysinfo.h>
#include <sys/times.h>
Y
yu yunfeng 已提交
21
#include <unistd.h>
22
#include <map>
Y
yu yunfeng 已提交
23

Y
youny626 已提交
24
#ifdef MILVUS_GPU_VERSION
S
shengjh 已提交
25

Y
youny626 已提交
26
#include <nvml.h>
S
shengjh 已提交
27

Y
youny626 已提交
28 29
#endif

J
jinhai 已提交
30
namespace milvus {
Y
yu yunfeng 已提交
31 32
namespace server {

S
starlord 已提交
33 34
void
SystemInfo::Init() {
S
starlord 已提交
35
    if (initialized_) {
S
starlord 已提交
36
        return;
S
starlord 已提交
37
    }
Y
yu yunfeng 已提交
38 39 40

    initialized_ = true;

Y
yu yunfeng 已提交
41
    // initialize CPU information
G
groot 已提交
42 43 44 45 46 47 48 49
    try {
        struct tms time_sample;
        last_cpu_ = times(&time_sample);
        last_sys_cpu_ = time_sample.tms_stime;
        last_user_cpu_ = time_sample.tms_utime;
        num_processors_ = 0;
        FILE* file = fopen("/proc/cpuinfo", "r");
        if (file) {
C
Cai Yudong 已提交
50
            char line[128];
G
groot 已提交
51 52 53 54 55 56 57 58 59 60 61
            while (fgets(line, 128, file) != nullptr) {
                if (strncmp(line, "processor", 9) == 0) {
                    num_processors_++;
                }
                if (strncmp(line, "physical", 8) == 0) {
                    num_physical_processors_ = ParseLine(line);
                }
            }
            fclose(file);
        } else {
            LOG_SERVER_ERROR_ << "Failed to read /proc/cpuinfo";
K
kun yu 已提交
62
        }
G
groot 已提交
63 64 65 66
        total_ram_ = GetPhysicalMemory();
    } catch (std::exception& ex) {
        std::string msg = "Failed to read /proc/cpuinfo, reason: " + std::string(ex.what());
        LOG_SERVER_ERROR_ << msg;
Y
yu yunfeng 已提交
67 68
    }

Y
youny626 已提交
69
#ifdef MILVUS_GPU_VERSION
S
starlord 已提交
70
    // initialize GPU information
Y
yu yunfeng 已提交
71 72
    nvmlReturn_t nvmlresult;
    nvmlresult = nvmlInit();
S
shengjh 已提交
73
    fiu_do_on("SystemInfo.Init.nvmInit_fail", nvmlresult = NVML_ERROR_NOT_FOUND);
S
starlord 已提交
74
    if (NVML_SUCCESS != nvmlresult) {
75
        LOG_SERVER_ERROR_ << "System information initilization failed";
S
starlord 已提交
76
        return;
Y
yu yunfeng 已提交
77
    }
Y
yu yunfeng 已提交
78
    nvmlresult = nvmlDeviceGetCount(&num_device_);
S
shengjh 已提交
79
    fiu_do_on("SystemInfo.Init.nvm_getDevice_fail", nvmlresult = NVML_ERROR_NOT_FOUND);
S
starlord 已提交
80
    if (NVML_SUCCESS != nvmlresult) {
81
        LOG_SERVER_ERROR_ << "Unable to get devidce number";
S
starlord 已提交
82
        return;
Y
yu yunfeng 已提交
83
    }
Y
youny626 已提交
84
#endif
Y
yu yunfeng 已提交
85

S
starlord 已提交
86
    // initialize network traffic information
G
groot 已提交
87
    try {
C
Cai Yudong 已提交
88
        std::pair<int64_t, int64_t> in_and_out_octets = Octets();
G
groot 已提交
89 90 91 92 93 94 95
        in_octets_ = in_and_out_octets.first;
        out_octets_ = in_and_out_octets.second;
        net_time_ = std::chrono::system_clock::now();
    } catch (std::exception& ex) {
        std::string msg = "Failed to initialize network traffic information, reason: " + std::string(ex.what());
        LOG_SERVER_ERROR_ << msg;
    }
Y
yu yunfeng 已提交
96 97
}

C
Cai Yudong 已提交
98
int64_t
S
starlord 已提交
99
SystemInfo::ParseLine(char* line) {
Y
yu yunfeng 已提交
100 101
    // This assumes that a digit will be found and the line ends in " Kb".
    int i = strlen(line);
S
starlord 已提交
102
    const char* p = line;
S
starlord 已提交
103 104 105
    while (*p < '0' || *p > '9') {
        p++;
    }
Y
yu yunfeng 已提交
106 107
    line[i - 3] = '\0';
    i = atoi(p);
C
Cai Yudong 已提交
108
    return i;
Y
yu yunfeng 已提交
109 110
}

C
Cai Yudong 已提交
111
int64_t
Y
yu yunfeng 已提交
112 113
SystemInfo::GetPhysicalMemory() {
    struct sysinfo memInfo;
S
starlord 已提交
114
    sysinfo(&memInfo);
C
Cai Yudong 已提交
115
    int64_t totalPhysMem = memInfo.totalram;
S
starlord 已提交
116
    // Multiply in next statement to avoid int overflow on right hand side...
Y
yu yunfeng 已提交
117
    totalPhysMem *= memInfo.mem_unit;
G
groot 已提交
118

Y
yu yunfeng 已提交
119 120 121
    return totalPhysMem;
}

C
Cai Yudong 已提交
122
int64_t
Y
yu yunfeng 已提交
123
SystemInfo::GetProcessUsedMemory() {
G
groot 已提交
124 125 126
    try {
        // Note: this value is in KB!
        FILE* file = fopen("/proc/self/status", "r");
C
Cai Yudong 已提交
127 128
        int64_t result = 0;
        constexpr int64_t KB = 1024;
G
groot 已提交
129
        if (file) {
C
Cai Yudong 已提交
130
            constexpr int64_t line_length = 128;
G
groot 已提交
131 132 133 134 135 136 137 138 139 140 141
            char line[line_length];

            while (fgets(line, line_length, file) != nullptr) {
                if (strncmp(line, "VmRSS:", 6) == 0) {
                    result = ParseLine(line);
                    break;
                }
            }
            fclose(file);
        } else {
            LOG_SERVER_ERROR_ << "Failed to read /proc/self/status";
Y
yu yunfeng 已提交
142
        }
G
groot 已提交
143 144

        // return value in Byte
C
Cai Yudong 已提交
145
        return (result * KB);
G
groot 已提交
146 147 148 149
    } catch (std::exception& ex) {
        std::string msg = "Failed to read /proc/self/status, reason: " + std::string(ex.what());
        LOG_SERVER_ERROR_ << msg;
        return 0;
Y
yu yunfeng 已提交
150 151 152 153 154
    }
}

double
SystemInfo::MemoryPercent() {
S
shengjh 已提交
155
    fiu_do_on("SystemInfo.MemoryPercent.mock", initialized_ = false);
S
starlord 已提交
156
    if (!initialized_) {
S
starlord 已提交
157
        Init();
S
starlord 已提交
158 159
    }

C
Cai Yudong 已提交
160
    auto mem_used = static_cast<double>(GetProcessUsedMemory() * 100);
S
starlord 已提交
161
    return mem_used / static_cast<double>(total_ram_);
Y
yu yunfeng 已提交
162 163
}

K
kun yu 已提交
164 165
std::vector<double>
SystemInfo::CPUCorePercent() {
C
Cai Yudong 已提交
166 167
    std::vector<int64_t> prev_work_time_array;
    std::vector<int64_t> prev_total_time_array = getTotalCpuTime(prev_work_time_array);
K
kun yu 已提交
168
    usleep(100000);
C
Cai Yudong 已提交
169 170
    std::vector<int64_t> cur_work_time_array;
    std::vector<int64_t> cur_total_time_array = getTotalCpuTime(cur_work_time_array);
K
kun yu 已提交
171 172

    std::vector<double> cpu_core_percent;
C
Cai Yudong 已提交
173
    for (size_t i = 0; i < cur_total_time_array.size(); i++) {
K
kun yu 已提交
174 175 176 177 178 179 180
        double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i];
        double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i];
        cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100);
    }
    return cpu_core_percent;
}

C
Cai Yudong 已提交
181 182 183
std::vector<int64_t>
SystemInfo::getTotalCpuTime(std::vector<int64_t>& work_time_array) {
    std::vector<int64_t> total_time_array;
G
groot 已提交
184 185
    try {
        FILE* file = fopen("/proc/stat", "r");
C
cqy123456 已提交
186
        fiu_do_on("SystemInfo.getTotalCpuTime.open_proc", file = nullptr);
C
Cai Yudong 已提交
187
        if (file == nullptr) {
G
groot 已提交
188
            LOG_SERVER_ERROR_ << "Failed to read /proc/stat";
K
kun yu 已提交
189 190 191
            return total_time_array;
        }

C
Cai Yudong 已提交
192 193
        int64_t user = 0, nice = 0, system = 0, idle = 0;
        int64_t iowait = 0, irq = 0, softirq = 0, steal = 0, guest = 0, guestnice = 0;
G
groot 已提交
194 195 196 197

        for (int i = 0; i < num_processors_; i++) {
            char buffer[1024];
            char* ret = fgets(buffer, sizeof(buffer) - 1, file);
C
cqy123456 已提交
198
            fiu_do_on("SystemInfo.getTotalCpuTime.read_proc", ret = nullptr);
C
Cai Yudong 已提交
199
            if (ret == nullptr) {
G
groot 已提交
200 201 202 203 204
                LOG_SERVER_ERROR_ << "Could not read stat file";
                fclose(file);
                return total_time_array;
            }

C
Cai Yudong 已提交
205
            sscanf(buffer, "cpu  %16ld %16ld %16ld %16ld %16ld %16ld %16ld %16ld %16ld %16ld", &user, &nice, &system,
G
groot 已提交
206
                   &idle, &iowait, &irq, &softirq, &steal, &guest, &guestnice);
K
kun yu 已提交
207

G
groot 已提交
208 209 210 211 212 213 214 215
            work_time_array.push_back(user + nice + system);
            total_time_array.push_back(user + nice + system + idle + iowait + irq + softirq + steal);
        }

        fclose(file);
    } catch (std::exception& ex) {
        std::string msg = "Failed to read /proc/stat, reason: " + std::string(ex.what());
        LOG_SERVER_ERROR_ << msg;
K
kun yu 已提交
216 217 218 219 220
    }

    return total_time_array;
}

Y
yu yunfeng 已提交
221 222
double
SystemInfo::CPUPercent() {
S
shengjh 已提交
223
    fiu_do_on("SystemInfo.CPUPercent.mock", initialized_ = false);
S
starlord 已提交
224
    if (!initialized_) {
S
starlord 已提交
225
        Init();
S
starlord 已提交
226
    }
Y
yu yunfeng 已提交
227
    struct tms time_sample;
Y
yu yunfeng 已提交
228 229 230
    clock_t now;
    double percent;

Y
yu yunfeng 已提交
231
    now = times(&time_sample);
S
starlord 已提交
232 233
    if (now <= last_cpu_ || time_sample.tms_stime < last_sys_cpu_ || time_sample.tms_utime < last_user_cpu_) {
        // Overflow detection. Just skip this value.
Y
yu yunfeng 已提交
234
        percent = -1.0;
S
starlord 已提交
235
    } else {
S
starlord 已提交
236
        percent = (time_sample.tms_stime - last_sys_cpu_) + (time_sample.tms_utime - last_user_cpu_);
Y
yu yunfeng 已提交
237
        percent /= (now - last_cpu_);
Y
yu yunfeng 已提交
238 239
        percent *= 100;
    }
Y
yu yunfeng 已提交
240 241 242
    last_cpu_ = now;
    last_sys_cpu_ = time_sample.tms_stime;
    last_user_cpu_ = time_sample.tms_utime;
Y
yu yunfeng 已提交
243 244 245 246

    return percent;
}

C
Cai Yudong 已提交
247
std::vector<int64_t>
K
kun yu 已提交
248
SystemInfo::GPUMemoryTotal() {
Y
yu yunfeng 已提交
249
    // get GPU usage percent
S
shengjh 已提交
250
    fiu_do_on("SystemInfo.GPUMemoryTotal.mock", initialized_ = false);
C
Cai Yudong 已提交
251
    if (!initialized_) {
S
starlord 已提交
252
        Init();
C
Cai Yudong 已提交
253
    }
C
Cai Yudong 已提交
254
    std::vector<int64_t> result;
Y
youny626 已提交
255 256

#ifdef MILVUS_GPU_VERSION
K
kun yu 已提交
257
    nvmlMemory_t nvmlMemory;
C
Cai Yudong 已提交
258
    for (uint32_t i = 0; i < num_device_; ++i) {
Y
yu yunfeng 已提交
259 260
        nvmlDevice_t device;
        nvmlDeviceGetHandleByIndex(i, &device);
K
kun yu 已提交
261 262
        nvmlDeviceGetMemoryInfo(device, &nvmlMemory);
        result.push_back(nvmlMemory.total);
Y
yu yunfeng 已提交
263
    }
Y
youny626 已提交
264 265
#endif

Y
yu yunfeng 已提交
266 267 268
    return result;
}

C
Cai Yudong 已提交
269
std::vector<int64_t>
S
starlord 已提交
270
SystemInfo::GPUTemperature() {
S
shengjh 已提交
271
    fiu_do_on("SystemInfo.GPUTemperature.mock", initialized_ = false);
C
Cai Yudong 已提交
272
    if (!initialized_) {
S
starlord 已提交
273
        Init();
C
Cai Yudong 已提交
274
    }
C
Cai Yudong 已提交
275
    std::vector<int64_t> result;
Y
youny626 已提交
276 277

#ifdef MILVUS_GPU_VERSION
C
Cai Yudong 已提交
278
    for (uint32_t i = 0; i < num_device_; i++) {
K
kun yu 已提交
279 280 281
        nvmlDevice_t device;
        nvmlDeviceGetHandleByIndex(i, &device);
        unsigned int temp;
S
starlord 已提交
282
        nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temp);
K
kun yu 已提交
283 284
        result.push_back(temp);
    }
Y
youny626 已提交
285 286
#endif

K
kun yu 已提交
287 288
    return result;
}
S
starlord 已提交
289

K
kun yu 已提交
290
std::vector<float>
S
starlord 已提交
291
SystemInfo::CPUTemperature() {
K
kun yu 已提交
292
    std::vector<float> result;
293
    std::string path = "/sys/class/hwmon/";
G
groot 已提交
294 295
    try {
        DIR* dir = opendir(path.c_str());
C
cqy123456 已提交
296
        fiu_do_on("SystemInfo.CPUTemperature.opendir", dir = nullptr);
G
groot 已提交
297 298 299 300
        if (!dir) {
            LOG_SERVER_ERROR_ << "Could not open hwmon directory";
            return result;
        }
301

C
Cai Yudong 已提交
302 303
        struct dirent* ptr = nullptr;
        while ((ptr = readdir(dir)) != nullptr) {
G
groot 已提交
304 305 306 307 308 309 310 311 312 313
            std::string filename(path);
            filename.append(ptr->d_name);

            char buf[100];
            if (readlink(filename.c_str(), buf, 100) != -1) {
                std::string m(buf);
                if (m.find("coretemp") != std::string::npos) {
                    std::string object = filename;
                    object += "/temp1_input";
                    FILE* file = fopen(object.c_str(), "r");
C
cqy123456 已提交
314
                    fiu_do_on("SystemInfo.CPUTemperature.openfile", file = nullptr);
G
groot 已提交
315 316 317 318 319
                    if (file == nullptr) {
                        LOG_SERVER_ERROR_ << "Could not open temperature file";
                        return result;
                    }
                    float temp;
320 321 322
                    if (fscanf(file, "%f", &temp) != -1) {
                        result.push_back(temp / 1000);
                    }
G
groot 已提交
323
                    fclose(file);
324 325
                }
            }
K
kun yu 已提交
326
        }
G
groot 已提交
327 328 329 330
        closedir(dir);
    } catch (std::exception& ex) {
        std::string msg = "Failed to get cpu temperature, reason: " + std::string(ex.what());
        LOG_SERVER_ERROR_ << msg;
K
kun yu 已提交
331
    }
G
groot 已提交
332

333
    return result;
K
kun yu 已提交
334 335
}

C
Cai Yudong 已提交
336
std::vector<int64_t>
Y
yu yunfeng 已提交
337 338
SystemInfo::GPUMemoryUsed() {
    // get GPU memory used
S
shengjh 已提交
339
    fiu_do_on("SystemInfo.GPUMemoryUsed.mock", initialized_ = false);
C
Cai Yudong 已提交
340
    if (!initialized_) {
S
starlord 已提交
341
        Init();
C
Cai Yudong 已提交
342
    }
Y
yu yunfeng 已提交
343

C
Cai Yudong 已提交
344
    std::vector<int64_t> result;
Y
youny626 已提交
345 346

#ifdef MILVUS_GPU_VERSION
Y
yu yunfeng 已提交
347
    nvmlMemory_t nvmlMemory;
C
Cai Yudong 已提交
348
    for (uint32_t i = 0; i < num_device_; ++i) {
Y
yu yunfeng 已提交
349 350 351 352 353
        nvmlDevice_t device;
        nvmlDeviceGetHandleByIndex(i, &device);
        nvmlDeviceGetMemoryInfo(device, &nvmlMemory);
        result.push_back(nvmlMemory.used);
    }
Y
youny626 已提交
354 355
#endif

Y
yu yunfeng 已提交
356 357 358
    return result;
}

C
Cai Yudong 已提交
359
std::pair<int64_t, int64_t>
S
starlord 已提交
360
SystemInfo::Octets() {
Y
yu yunfeng 已提交
361 362 363 364
    const std::string filename = "/proc/net/netstat";
    std::ifstream file(filename);
    std::string lastline = "";
    std::string line = "";
S
starlord 已提交
365
    while (true) {
Y
yu yunfeng 已提交
366
        getline(file, line);
S
starlord 已提交
367
        if (file.fail()) {
Y
yu yunfeng 已提交
368 369 370 371 372
            break;
        }
        lastline = line;
    }
    std::vector<size_t> space_position;
C
Cai Yudong 已提交
373
    size_t space_pos = lastline.find(' ');
S
starlord 已提交
374
    while (space_pos != std::string::npos) {
Y
yu yunfeng 已提交
375
        space_position.push_back(space_pos);
C
Cai Yudong 已提交
376
        space_pos = lastline.find(' ', space_pos + 1);
Y
yu yunfeng 已提交
377 378
    }
    // InOctets is between 6th and 7th " " and OutOctets is between 7th and 8th " "
S
starlord 已提交
379 380 381 382 383 384 385
    size_t inoctets_begin = space_position[6] + 1;
    size_t inoctets_length = space_position[7] - inoctets_begin;
    size_t outoctets_begin = space_position[7] + 1;
    size_t outoctets_length = space_position[8] - outoctets_begin;
    std::string inoctets = lastline.substr(inoctets_begin, inoctets_length);
    std::string outoctets = lastline.substr(outoctets_begin, outoctets_length);

C
Cai Yudong 已提交
386 387 388
    int64_t inoctets_bytes = std::stoull(inoctets);
    int64_t outoctets_bytes = std::stoull(outoctets);
    std::pair<int64_t, int64_t> res(inoctets_bytes, outoctets_bytes);
Y
yu yunfeng 已提交
389 390 391
    return res;
}

392 393 394 395 396 397 398
void
SystemInfo::GetSysInfoJsonStr(std::string& result) {
    std::map<std::string, std::string> sys_info_map;

    sys_info_map["memory_total"] = std::to_string(GetPhysicalMemory());
    sys_info_map["memory_used"] = std::to_string(GetProcessUsedMemory());

C
Cai Yudong 已提交
399 400
    std::vector<int64_t> gpu_mem_total = GPUMemoryTotal();
    std::vector<int64_t> gpu_mem_used = GPUMemoryUsed();
401 402 403 404 405 406 407 408 409 410 411
    for (size_t i = 0; i < gpu_mem_total.size(); i++) {
        std::string key_total = "gpu" + std::to_string(i) + "_memory_total";
        std::string key_used = "gpu" + std::to_string(i) + "_memory_used";
        sys_info_map[key_total] = std::to_string(gpu_mem_total[i]);
        sys_info_map[key_used] = std::to_string(gpu_mem_used[i]);
    }

    nlohmann::json sys_info_json(sys_info_map);
    result = sys_info_json.dump();
}

S
starlord 已提交
412 413
}  // namespace server
}  // namespace milvus