diff --git a/cpp/CHANGELOG.md b/cpp/CHANGELOG.md index 80d1a70c8b62ad046c00795f27d0cc39ad087bf9..7ecac566bb583a0d95a55c167d808b12fcd777ee 100644 --- a/cpp/CHANGELOG.md +++ b/cpp/CHANGELOG.md @@ -26,3 +26,4 @@ Please mark all change in change log and use the ticket from JIRA. - MS-30 - Use faiss v1.5.2 - MS-32 - Fix thrift error - MS-34 - Fix prometheus-cpp thirdparty +- MS-37 - Add query, cache usage, disk write speed and file data size metrics diff --git a/cpp/src/db/DBImpl.inl b/cpp/src/db/DBImpl.inl index 2170358a432632d6becf2240f9cd91607cd30279..706f8a481bf84972e8192ad9b49b98365d9915ec 100644 --- a/cpp/src/db/DBImpl.inl +++ b/cpp/src/db/DBImpl.inl @@ -71,17 +71,26 @@ Status DBImpl::InsertVectors(const std::string& table_id_, // server::Metrics::GetInstance().add_vector_duration_seconds_quantiles().Observe((average_time)); if (!status.ok()) { server::Metrics::GetInstance().AddVectorsFailTotalIncrement(n); + server::Metrics::GetInstance().AddVectorsFailGaugeSet(n); return status; } server::Metrics::GetInstance().AddVectorsSuccessTotalIncrement(n); + server::Metrics::GetInstance().AddVectorsSuccessGaugeSet(n); } template Status DBImpl::Query(const std::string &table_id, size_t k, size_t nq, const float *vectors, QueryResults &results) { - + auto start_time = METRICS_NOW_TIME; meta::DatesT dates = {meta::Meta::GetDate()}; - return Query(table_id, k, nq, vectors, dates, results); + Status result = Query(table_id, k, nq, vectors, dates, results); + auto end_time = METRICS_NOW_TIME; + auto total_time = METRICS_MICROSECONDS(start_time,end_time); + auto average_time = total_time / nq; + for (int i = 0; i < nq; ++i) { + server::Metrics::GetInstance().QueryResponseSummaryObserve(average_time); + } + return result; } template @@ -250,7 +259,12 @@ void DBImpl::BackgroundTimerTask(int interval) { if (shutting_down_.load(std::memory_order_acquire)) break; std::this_thread::sleep_for(std::chrono::seconds(interval)); - + int64_t cache_total = cache::CpuCacheMgr::GetInstance()->CacheUsage(); + LOG(DEBUG) << "Cache usage " << cache_total; + server::Metrics::GetInstance().CacheUsageGaugeSet(static_cast(cache_total)); + long size; + Size(size); + server::Metrics::GetInstance().DataFileSizeGaugeSet(size); TrySchedule(); } } diff --git a/cpp/src/db/MemManager.inl b/cpp/src/db/MemManager.inl index 35e7c70ada078af8514fd5a80195c8d1fa9c0ce2..528622795de8f8c29227bdeded2232bf07ad2b4e 100644 --- a/cpp/src/db/MemManager.inl +++ b/cpp/src/db/MemManager.inl @@ -8,6 +8,7 @@ #include "MemManager.h" #include "Meta.h" #include "MetaConsts.h" +#include "metrics/Metrics.h" #include #include @@ -48,8 +49,14 @@ template Status MemVectors::Serialize(std::string& table_id) { table_id = schema_.table_id; auto size = ApproximateSize(); + auto start_time = METRICS_NOW_TIME; pEE_->Serialize(); + auto end_time = METRICS_NOW_TIME; + auto total_time = METRICS_MICROSECONDS(start_time, end_time); schema_.size = size; + + server::Metrics::GetInstance().DiskStoreIOSpeedGaugeSet(size/total_time); + schema_.file_type = (size >= options_.index_trigger_size) ? meta::TableFileSchema::TO_INDEX : meta::TableFileSchema::RAW; diff --git a/cpp/src/main.cpp b/cpp/src/main.cpp index 61614e3557524069d7b018760ecbf8a303db48ae..08ecb8c194342dc154a93dd48ce16561edb458b0 100644 --- a/cpp/src/main.cpp +++ b/cpp/src/main.cpp @@ -11,6 +11,7 @@ #include #include #include +#include "metrics/Metrics.h" #include "utils/SignalUtil.h" #include "utils/CommonUtil.h" @@ -25,7 +26,6 @@ using namespace zilliz::vecwise; int main(int argc, char *argv[]) { printf("Vecwise engine server start...\n"); - // zilliz::lib::gpu::InitMemoryAllocator(); signal(SIGINT, server::SignalUtil::HandleSignal); diff --git a/cpp/src/metrics/MetricBase.h b/cpp/src/metrics/MetricBase.h index d720b8de44e3b3257d239ae9da5d053083343a7a..ccda229f2a8f54c2c65afa74f6d9a8f2917bcdf2 100644 --- a/cpp/src/metrics/MetricBase.h +++ b/cpp/src/metrics/MetricBase.h @@ -64,7 +64,11 @@ class MetricsBase{ virtual void IndexFileSizeGaugeSet(double value) {}; virtual void RawFileSizeGaugeSet(double value) {}; virtual void FaissDiskLoadIOSpeedGaugeSet(double value) {}; - + virtual void QueryResponseSummaryObserve(double value) {}; + virtual void DiskStoreIOSpeedGaugeSet(double value) {}; + virtual void DataFileSizeGaugeSet(double value) {}; + virtual void AddVectorsSuccessGaugeSet(double value) {}; + virtual void AddVectorsFailGaugeSet(double value) {}; }; diff --git a/cpp/src/metrics/Metrics.cpp b/cpp/src/metrics/Metrics.cpp index feb986b1629abbe17f0aee2bfd8d04eabe2d75e4..1bacf4ff0b89f51294faef6ceef77eec4cf2ddff 100644 --- a/cpp/src/metrics/Metrics.cpp +++ b/cpp/src/metrics/Metrics.cpp @@ -4,7 +4,6 @@ * Proprietary and confidential. ******************************************************************************/ -#pragma once #include "Metrics.h" #include "PrometheusMetrics.h" diff --git a/cpp/src/metrics/PrometheusMetrics.h b/cpp/src/metrics/PrometheusMetrics.h index 000aa31608985537f8c44795b5eb46c706f2a4cf..cebd48b5aa6b96f50228bc44d0a24752f0013595 100644 --- a/cpp/src/metrics/PrometheusMetrics.h +++ b/cpp/src/metrics/PrometheusMetrics.h @@ -97,7 +97,11 @@ class PrometheusMetrics: public MetricsBase { void RawFileSizeTotalIncrement(double value = 1) { if(startup_) raw_file_size_total_.Increment(value);}; void IndexFileSizeGaugeSet(double value) { if(startup_) index_file_size_gauge_.Set(value);}; void RawFileSizeGaugeSet(double value) { if(startup_) raw_file_size_gauge_.Set(value);}; - + void QueryResponseSummaryObserve(double value) {if(startup_) query_response_summary_.Observe(value);}; + void DiskStoreIOSpeedGaugeSet(double value) { if(startup_) disk_store_IO_speed_gauge_.Set(value);}; + void DataFileSizeGaugeSet(double value) { if(startup_) data_file_size_gauge_.Set(value);}; + void AddVectorsSuccessGaugeSet(double value) { if(startup_) add_vectors_success_gauge_.Set(value);}; + void AddVectorsFailGaugeSet(double value) { if(startup_) add_vectors_fail_gauge_.Set(value);}; @@ -295,11 +299,6 @@ class PrometheusMetrics: public MetricsBase { ////all form Cache.cpp //record cache usage, when insert/erase/clear/free - prometheus::Family &cache_usage_ = prometheus::BuildGauge() - .Name("cache_usage") - .Help("total bytes that cache used") - .Register(*registry_); - prometheus::Gauge &cache_usage_gauge_ = cache_usage_.Add({}); ////all from Meta.cpp @@ -386,6 +385,39 @@ class PrometheusMetrics: public MetricsBase { .Register(*registry_); prometheus::Counter &cache_access_total_ = cache_access_.Add({}); + // record cache usage and % + prometheus::Family &cache_usage_ = prometheus::BuildGauge() + .Name("cache_usage_bytes") + .Help("current cache usage by bytes") + .Register(*registry_); + prometheus::Gauge &cache_usage_gauge_ = cache_usage_.Add({}); + + // record query response + using Quantiles = std::vector; + prometheus::Family &query_response_ = prometheus::BuildSummary() + .Name("query_response_summary") + .Help("query response summary") + .Register(*registry_); + prometheus::Summary &query_response_summary_ = query_response_.Add({}, Quantiles{{0.95,0.00},{0.9,0.05},{0.8,0.1}}); + + prometheus::Family &disk_store_IO_speed_ = prometheus::BuildGauge() + .Name("disk_store_IO_speed_bytes_per_microseconds") + .Help("disk_store_IO_speed") + .Register(*registry_); + prometheus::Gauge &disk_store_IO_speed_gauge_ = disk_store_IO_speed_.Add({}); + + prometheus::Family &data_file_size_ = prometheus::BuildGauge() + .Name("data_file_size_bytes") + .Help("data file size by bytes") + .Register(*registry_); + prometheus::Gauge &data_file_size_gauge_ = data_file_size_.Add({}); + + prometheus::Family &add_vectors_ = prometheus::BuildGauge() + .Name("add_vectors") + .Help("current added vectors") + .Register(*registry_); + prometheus::Gauge &add_vectors_success_gauge_ = add_vectors_.Add({{"outcome", "success"}}); + prometheus::Gauge &add_vectors_fail_gauge_ = add_vectors_.Add({{"outcome", "fail"}}); }; diff --git a/cpp/src/server/Server.cpp b/cpp/src/server/Server.cpp index b27dd279f5768b4a0ed6f18455464e0feb362a02..a9901dc93e48442adef92dd9a223243d05cfce8c 100644 --- a/cpp/src/server/Server.cpp +++ b/cpp/src/server/Server.cpp @@ -138,7 +138,7 @@ int Server::Start() { // server::Metrics::GetInstance().Init(); // server::Metrics::GetInstance().exposer_ptr()->RegisterCollectable(server::Metrics::GetInstance().registry_ptr()); - server::Metrics::GetInstance().Init(); +// server::Metrics::GetInstance().Init(); if (daemonized_) { Daemonize(); @@ -177,7 +177,7 @@ Server::Start() { signal(SIGINT, SignalUtil::HandleSignal); signal(SIGHUP, SignalUtil::HandleSignal); signal(SIGTERM, SignalUtil::HandleSignal); - + server::Metrics::GetInstance().Init(); SERVER_LOG_INFO << "Vecwise server is running..."; StartService(); diff --git a/cpp/unittest/metrics/CMakeLists.txt b/cpp/unittest/metrics/CMakeLists.txt index 954f34d141165583c540214975f891ff7847e7b0..2560467c5b4b40e3e93e544e88f7bcd4485d6a1e 100644 --- a/cpp/unittest/metrics/CMakeLists.txt +++ b/cpp/unittest/metrics/CMakeLists.txt @@ -31,7 +31,8 @@ set(require_files ../../src/metrics/Metrics.cpp # ../../src/cache/CacheMgr.cpp -# ../../src/metrics/PrometheusMetrics.cpp + ../../src/metrics/PrometheusMetrics.cpp + ../../src/metrics/MetricBase.h ../../src/server/ServerConfig.cpp ../../src/utils/CommonUtil.cpp ../../src/utils/TimeRecorder.cpp diff --git a/cpp/unittest/metrics/metrics_test.cpp b/cpp/unittest/metrics/metrics_test.cpp index 31a57d5ac65de8b009d73a0acea13487d7823e5a..0efc36a3bf0ede2d78a7ff1e358166423f1ad119 100644 --- a/cpp/unittest/metrics/metrics_test.cpp +++ b/cpp/unittest/metrics/metrics_test.cpp @@ -32,7 +32,7 @@ TEST_F(DBTest, Metric_Tes) { // server::Metrics::GetInstance().exposer_ptr()->RegisterCollectable(server::Metrics::GetInstance().registry_ptr()); server::Metrics::GetInstance().Init(); // server::PrometheusMetrics::GetInstance().exposer_ptr()->RegisterCollectable(server::PrometheusMetrics::GetInstance().registry_ptr()); - zilliz::vecwise::cache::CpuCacheMgr::GetInstance()->SetCapacity(1*1024*1024*1024); + zilliz::vecwise::cache::CpuCacheMgr::GetInstance()->SetCapacity(4*1024*1024*1024); std::cout<CacheCapacity()<