提交 79ea7ecb 编写于 作者: J jinhai

Merge branch 'branch-0.3.1' into branch-0.4.0


Former-commit-id: b479db2044d96bd9cd84e3ed6d5a33c8df2442a6
timeout(time: 30, unit: 'MINUTES') {
timeout(time: 40, unit: 'MINUTES') {
try {
dir ("${PROJECT_NAME}_test") {
checkout([$class: 'GitSCM', branches: [[name: "${SEMVER}"]], doGenerateSubmoduleConfigurations: false, extensions: [], submoduleCfg: [], userRemoteConfigs: [[credentialsId: "${params.GIT_USER}", url: "git@192.168.1.105:Test/milvus_test.git", name: 'origin', refspec: "+refs/heads/${SEMVER}:refs/remotes/origin/${SEMVER}"]]])
......
......@@ -50,7 +50,11 @@ Please mark all change in change log and use the ticket from JIRA.
- MS-261 - Update faiss version to 1.5.3 and add BUILD_FAISS_WITH_MKL as an option
- MS-266 - Improve topk reduce time by using multi-threads
- MS-275 - Avoid sqlite logic error exception
- MS-278 - add IndexStatsHelper
- MS-278 - Add IndexStatsHelper
- MS-312 - Set openmp thread number by config
- MS-305 - Add CPU core percent metric
- MS-310 - Add milvus CPU utilization ratio and CPU/GPU temperature metrics
- MS-324 - Show error when there is not enough gpu memory to build index
## New Feature
- MS-180 - Add new mem manager
......
......@@ -43,4 +43,5 @@ engine_config:
nprobe: 10
nlist: 16384
use_blas_threshold: 20
metric_type: L2 # compare vectors by euclidean distance(L2) or inner product(IP), optional: L2 or IP
metric_type: L2 # compare vectors by euclidean distance(L2) or inner product(IP), optional: L2 or IP
omp_thread_num: 0 # how many compute threads be used by engine, 0 means use all cpu core to compute
......@@ -326,7 +326,8 @@ void DBImpl::StartMetricTask() {
server::Metrics::GetInstance().OctetsSet();
server::Metrics::GetInstance().CPUCoreUsagePercentSet();
server::Metrics::GetInstance().GPUTemperature();
server::Metrics::GetInstance().CPUTemperature();
ENGINE_LOG_TRACE << "Metric task finished";
}
......@@ -541,11 +542,27 @@ Status DBImpl::BuildIndex(const meta::TableFileSchema& file) {
}
//step 3: build index
auto start_time = METRICS_NOW_TIME;
auto index = to_index->BuildIndex(table_file.location_);
auto end_time = METRICS_NOW_TIME;
auto total_time = METRICS_MICROSECONDS(start_time, end_time);
server::Metrics::GetInstance().BuildIndexDurationSecondsHistogramObserve(total_time);
std::shared_ptr<ExecutionEngine> index;
try {
auto start_time = METRICS_NOW_TIME;
index = to_index->BuildIndex(table_file.location_);
auto end_time = METRICS_NOW_TIME;
auto total_time = METRICS_MICROSECONDS(start_time, end_time);
server::Metrics::GetInstance().BuildIndexDurationSecondsHistogramObserve(total_time);
} catch (std::exception& ex) {
//typical error: out of gpu memory
std::string msg = "BuildIndex encounter exception" + std::string(ex.what());
ENGINE_LOG_ERROR << msg;
table_file.file_type_ = meta::TableFileSchema::TO_DELETE;
status = meta_ptr_->UpdateTableFile(table_file);
ENGINE_LOG_DEBUG << "Failed to update file to index, mark file: " << table_file.file_id_ << " to to_delete";
std::cout << "ERROR: failed to build index, index file is too large or gpu memory is not enough" << std::endl;
return Status::Error(msg);
}
//step 4: if table has been deleted, dont save index file
bool has_table = false;
......@@ -556,7 +573,22 @@ Status DBImpl::BuildIndex(const meta::TableFileSchema& file) {
}
//step 5: save index file
index->Serialize();
try {
index->Serialize();
} catch (std::exception& ex) {
//typical error: out of disk space or permission denied
std::string msg = "Serialize index encounter exception" + std::string(ex.what());
ENGINE_LOG_ERROR << msg;
table_file.file_type_ = meta::TableFileSchema::TO_DELETE;
status = meta_ptr_->UpdateTableFile(table_file);
ENGINE_LOG_DEBUG << "Failed to update file to index, mark file: " << table_file.file_id_ << " to to_delete";
std::cout << "ERROR: failed to persist index file: " << table_file.location_
<< ", possible out of disk space" << std::endl;
return Status::Error(msg);
}
//step 6: update meta
table_file.file_type_ = meta::TableFileSchema::INDEX;
......
......@@ -1005,7 +1005,7 @@ Status DBMetaImpl::CleanUpFilesWithTTL(uint16_t seconds) {
table_file.date_ = std::get<3>(file);
utils::DeleteTableFilePath(options_, table_file);
ENGINE_LOG_DEBUG << "Removing file id:" << table_file.id_ << " location:" << table_file.location_;
ENGINE_LOG_DEBUG << "Removing file id:" << table_file.file_id_ << " location:" << table_file.location_;
ConnectorPtr->remove<TableFileSchema>(table_file.id_);
}
......
......@@ -66,6 +66,8 @@ class MetricsBase{
virtual void OctetsSet() {};
virtual void CPUCoreUsagePercentSet() {};
virtual void GPUTemperature() {};
virtual void CPUTemperature() {};
};
......
......@@ -34,8 +34,6 @@ PrometheusMetrics::Init() {
return SERVER_UNEXPECTED_ERROR;
}
//
return SERVER_SUCCESS;
}
......@@ -44,8 +42,6 @@ PrometheusMetrics::Init() {
void
PrometheusMetrics::CPUUsagePercentSet() {
    // Sample the overall CPU usage of this process and publish it as the
    // "avg" CPU gauge. No-op when Prometheus metrics failed to initialize.
    if (!startup_) return;

    // Removed unused local `numProcessor`: the per-core division was dropped
    // from SystemInfo::CPUPercent(), so the processor count is no longer needed here.
    double usage_percent = server::SystemInfo::GetInstance().CPUPercent();
    CPU_usage_percent_.Set(usage_percent);
}
......@@ -64,13 +60,11 @@ PrometheusMetrics::GPUPercentGaugeSet() {
std::vector<unsigned long long > used_total = server::SystemInfo::GetInstance().GPUMemoryTotal();
std::vector<unsigned long long > used_memory = server::SystemInfo::GetInstance().GPUMemoryUsed();
for (int i = 0; i < numDevice; i++) {
for (int i = 0; i < numDevice; ++i) {
prometheus::Gauge &GPU_percent = GPU_percent_.Add({{"DeviceNum", std::to_string(i)}});
double percent = (double)used_memory[i] / (double)used_total[i];
GPU_percent.Set(percent * 100);
}
}
void PrometheusMetrics::GPUMemoryUsageGaugeSet() {
......@@ -79,7 +73,7 @@ void PrometheusMetrics::GPUMemoryUsageGaugeSet() {
constexpr unsigned long long MtoB = 1024*1024;
int numDevice = server::SystemInfo::GetInstance().num_device();
for (int i = 0; i < numDevice; i++) {
for (int i = 0; i < numDevice; ++i) {
prometheus::Gauge &GPU_memory = GPU_memory_usage_.Add({{"DeviceNum", std::to_string(i)}});
GPU_memory.Set(values[i] / MtoB);
}
......@@ -142,12 +136,35 @@ void PrometheusMetrics::CPUCoreUsagePercentSet() {
std::vector<double> cpu_core_percent = server::SystemInfo::GetInstance().CPUCorePercent();
for (int i = 0; i < cpu_core_percent.size(); i++) {
for (int i = 0; i < cpu_core_percent.size(); ++i) {
prometheus::Gauge &core_percent = CPU_.Add({{"CPU", std::to_string(i)}});
core_percent.Set(cpu_core_percent[i]);
}
}
void PrometheusMetrics::GPUTemperature() {
    // Publish the current temperature of every visible GPU as a labeled
    // gauge — one time series per device index ({"GPU", "<i>"}).
    // No-op when Prometheus metrics failed to initialize.
    if (!startup_)
        return;

    std::vector<unsigned int> GPU_temperatures = server::SystemInfo::GetInstance().GPUTemperature();
    // size_t index avoids the signed/unsigned comparison against size().
    for (size_t i = 0; i < GPU_temperatures.size(); ++i) {
        prometheus::Gauge &gpu_temp = GPU_temperature_.Add({{"GPU", std::to_string(i)}});
        gpu_temp.Set(GPU_temperatures[i]);
    }
}
void PrometheusMetrics::CPUTemperature() {
    // Publish the per-zone CPU temperature readings as labeled gauges —
    // one time series per reading index ({"CPU", "<i>"}).
    // No-op when Prometheus metrics failed to initialize.
    if (!startup_)
        return;

    std::vector<float> CPU_temperatures = server::SystemInfo::GetInstance().CPUTemperature();
    // size_t index avoids the signed/unsigned comparison against size().
    for (size_t i = 0; i < CPU_temperatures.size(); ++i) {
        prometheus::Gauge &cpu_temp = CPU_temperature_.Add({{"CPU", std::to_string(i)}});
        cpu_temp.Set(CPU_temperatures[i]);
    }
}
}
}
......
......@@ -79,7 +79,6 @@ class PrometheusMetrics: public MetricsBase {
void QueryVectorResponseSummaryObserve(double value, int count = 1) override { if (startup_) for(int i = 0 ; i < count ; ++i) query_vector_response_summary_.Observe(value);};
void QueryVectorResponsePerSecondGaugeSet(double value) override {if (startup_) query_vector_response_per_second_gauge_.Set(value);};
void CPUUsagePercentSet() override ;
void CPUCoreUsagePercentSet() override;
void RAMUsagePercentSet() override ;
......@@ -93,6 +92,9 @@ class PrometheusMetrics: public MetricsBase {
void KeepingAliveCounterIncrement(double value = 1) override {if(startup_) keeping_alive_counter_.Increment(value);};
void OctetsSet() override ;
void GPUTemperature() override;
void CPUTemperature() override;
......@@ -396,7 +398,7 @@ class PrometheusMetrics: public MetricsBase {
.Name("CPU_usage_percent")
.Help("CPU usage percent by this this process")
.Register(*registry_);
prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "0"}});
prometheus::Gauge &CPU_usage_percent_ = CPU_.Add({{"CPU", "avg"}});
prometheus::Family<prometheus::Gauge> &RAM_ = prometheus::BuildGauge()
......@@ -444,6 +446,15 @@ class PrometheusMetrics: public MetricsBase {
prometheus::Gauge &outoctets_gauge_ = octets_.Add({{"type", "outoctets"}});
prometheus::Family<prometheus::Gauge> &GPU_temperature_ = prometheus::BuildGauge()
.Name("GPU_temperature")
.Help("GPU temperature")
.Register(*registry_);
prometheus::Family<prometheus::Gauge> &CPU_temperature_ = prometheus::BuildGauge()
.Name("CPU_temperature")
.Help("CPU temperature")
.Register(*registry_);
};
......
......@@ -36,6 +36,9 @@ void SystemInfo::Init() {
num_processors_ = 0;
while(fgets(line, 128, file) != NULL){
if (strncmp(line, "processor", 9) == 0) num_processors_++;
if (strncmp(line, "physical", 8) == 0) {
num_physical_processors_ = ParseLine(line);
}
}
total_ram_ = GetPhysicalMemory();
fclose(file);
......@@ -108,8 +111,6 @@ SystemInfo::MemoryPercent() {
return (double)(GetProcessUsedMemory()*100)/(double)total_ram_;
}
std::vector<double>
SystemInfo::CPUCorePercent() {
std::vector<unsigned long long> prev_work_time_array;
......@@ -119,7 +120,7 @@ SystemInfo::CPUCorePercent() {
std::vector<unsigned long long> cur_total_time_array = getTotalCpuTime(cur_work_time_array);
std::vector<double> cpu_core_percent;
for (int i = 0; i < num_processors_; i++) {
for (int i = 1; i < num_processors_; i++) {
double total_cpu_time = cur_total_time_array[i] - prev_total_time_array[i];
double cpu_work_time = cur_work_time_array[i] - prev_work_time_array[i];
cpu_core_percent.push_back((cpu_work_time / total_cpu_time) * 100);
......@@ -181,7 +182,6 @@ SystemInfo::CPUPercent() {
percent = (time_sample.tms_stime - last_sys_cpu_) +
(time_sample.tms_utime - last_user_cpu_);
percent /= (now - last_cpu_);
percent /= num_processors_;
percent *= 100;
}
last_cpu_ = now;
......@@ -207,6 +207,36 @@ SystemInfo::GPUMemoryTotal() {
return result;
}
std::vector<unsigned int>
SystemInfo::GPUTemperature(){
    // Query NVML for the core temperature of each GPU device.
    // Returns one entry per device, in device-index order.
    if(!initialized_) Init();
    std::vector<unsigned int> result;
    result.reserve(num_device_);
    // unsigned loop index matches the type of num_device_ (unsigned int).
    for (unsigned int i = 0; i < num_device_; i++) {
        nvmlDevice_t device;
        nvmlDeviceGetHandleByIndex(i, &device);
        // Zero-initialize so a failed NVML query reports 0 instead of
        // pushing an uninitialized value (UB in the original).
        unsigned int temp = 0;
        nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &temp);
        result.push_back(temp);
    }
    return result;
}
std::vector<float>
SystemInfo::CPUTemperature(){
    // Read the temperature of each thermal zone the kernel exposes under
    // /sys/class/thermal/.  Zone files report millidegrees Celsius, hence
    // the division by 1000.  On the first unreadable zone the readings
    // collected so far are returned (best-effort, as in the original).
    //
    // NOTE(review): iterating 0..num_physical_processors_ assumes one
    // thermal zone per physical package (plus one) — thermal zones are not
    // guaranteed to map to CPU packages; verify on target hardware.
    std::vector<float> result;
    for (int i = 0; i <= num_physical_processors_; ++i) {
        std::string path = "/sys/class/thermal/thermal_zone" + std::to_string(i) + "/temp";
        FILE *file = fopen(path.data(), "r");
        if (file == NULL) {
            perror("Could not open thermal file");
            return result;
        }
        float temp = 0.0f;
        // Only record the zone when a value was actually parsed; the
        // original pushed an uninitialized float on a failed fscanf.
        if (fscanf(file, "%f", &temp) == 1) {
            result.push_back(temp / 1000);
        }
        fclose(file);  // was leaked in the original
    }
    return result;  // original fell off the end without returning (UB)
}
std::vector<unsigned long long>
SystemInfo::GPUMemoryUsed() {
// get GPU memory used
......
......@@ -32,6 +32,7 @@ class SystemInfo {
clock_t last_user_cpu_ = clock_t();
std::chrono::system_clock::time_point net_time_ = std::chrono::system_clock::now();
int num_processors_ = 0;
int num_physical_processors_ = 0;
//number of GPU
unsigned int num_device_ = 0;
unsigned long long in_octets_ = 0;
......@@ -47,6 +48,7 @@ class SystemInfo {
void Init();
int num_processor() const { return num_processors_;};
int num_physical_processors() const { return num_physical_processors_; };
int num_device() const {return num_device_;};
unsigned long long get_inoctets() { return in_octets_;};
unsigned long long get_octets() { return out_octets_;};
......@@ -65,7 +67,8 @@ class SystemInfo {
std::vector<double> CPUCorePercent();
std::vector<unsigned long long> getTotalCpuTime(std::vector<unsigned long long> &workTime);
std::vector<unsigned int> GPUTemperature();
std::vector<float> CPUTemperature();
};
......
......@@ -10,11 +10,14 @@
#include "utils/Log.h"
#include "utils/StringHelpFunctions.h"
#include <omp.h>
namespace zilliz {
namespace milvus {
namespace server {
DBWrapper::DBWrapper() {
//db config
zilliz::milvus::engine::Options opt;
ConfigNode& db_config = ServerConfig::GetInstance().GetConfig(CONFIG_DB);
opt.meta.backend_uri = db_config.GetValue(CONFIG_DB_URL);
......@@ -37,6 +40,7 @@ DBWrapper::DBWrapper() {
kill(0, SIGUSR1);
}
// cache config
ConfigNode& cache_config = ServerConfig::GetInstance().GetConfig(CONFIG_CACHE);
opt.insert_cache_immediately_ = cache_config.GetBoolValue(CONFIG_INSERT_CACHE_IMMEDIATELY, false);
......@@ -56,6 +60,14 @@ DBWrapper::DBWrapper() {
kill(0, SIGUSR1);
}
// engine config
ConfigNode& engine_config = ServerConfig::GetInstance().GetConfig(CONFIG_ENGINE);
int32_t omp_thread = engine_config.GetInt32Value(CONFIG_OMP_THREAD_NUM, 0);
if(omp_thread > 0) {
omp_set_num_threads(omp_thread);
SERVER_LOG_DEBUG << "Specify openmp thread number: " << omp_thread;
}
//set archive config
engine::ArchiveConf::CriteriaT criterial;
int64_t disk = db_config.GetInt64Value(CONFIG_DB_ARCHIVE_DISK, 0);
......
......@@ -53,6 +53,7 @@ static const std::string CONFIG_NPROBE = "nprobe";
static const std::string CONFIG_NLIST = "nlist";
static const std::string CONFIG_DCBT = "use_blas_threshold";
static const std::string CONFIG_METRICTYPE = "metric_type";
static const std::string CONFIG_OMP_THREAD_NUM = "omp_thread_num";
class ServerConfig {
public:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册