From 0ce42fb0868fd74733dd1846fe054a2cc918402a Mon Sep 17 00:00:00 2001 From: zmxdream Date: Tue, 10 May 2022 11:06:23 +0800 Subject: [PATCH] merge develop. test=develop (#42624) --- .../framework/fleet/heter_ps/hashtable.h | 9 +- .../fleet/heter_ps/hashtable_kernel.cu | 32 ++++- .../fleet/heter_ps/hashtable_kernel.kps | 33 ++--- .../framework/fleet/heter_ps/heter_comm.h | 2 - .../framework/fleet/heter_ps/heter_comm_inl.h | 2 - .../framework/fleet/heter_ps/heter_ps.cu | 8 ++ .../fluid/framework/fleet/heter_ps/heter_ps.h | 2 - .../framework/fleet/heter_ps/heter_ps_base.h | 8 +- .../framework/fleet/heter_ps/optimizer.cuh.h | 73 ++++++----- .../framework/fleet/heter_ps/optimizer_conf.h | 95 ++++++++------ .../fluid/framework/fleet/ps_gpu_wrapper.cu | 32 ++--- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 18 +-- paddle/fluid/framework/ps_gpu_trainer.cc | 116 ++++++++---------- paddle/fluid/framework/trainer.h | 2 +- 14 files changed, 212 insertions(+), 220 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable.h b/paddle/fluid/framework/fleet/heter_ps/hashtable.h index b860ea5d39c..e2f362d4074 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable.h @@ -41,9 +41,7 @@ limitations under the License. */ #include "xpu/kernel/simd.h" #endif -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif namespace paddle { namespace framework { @@ -132,10 +130,8 @@ class HashTable { void show(); -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif template void dump_to_cpu(int devid, StreamType stream); @@ -178,9 +174,10 @@ class HashTable { TableContainer* container_; #elif defined(PADDLE_WITH_XPU_KP) XPUCacheArray* container_; - OptimizerConfig* xpu_optimizer_config_; - OptimizerConfig cpu_optimizer_config_; #endif + OptimizerConfig* device_optimizer_config_; + OptimizerConfig host_optimizer_config_; + int BLOCK_SIZE_{256}; float LOAD_FACTOR{0.75f}; size_t capacity_; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu index 87b62c6d380..df93f056917 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.cu @@ -95,6 +95,7 @@ __global__ void dy_mf_search_kernel(Table* table, template __global__ void update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const GradType* const grads, size_t len, Sgd sgd) { @@ -102,13 +103,14 @@ __global__ void update_kernel(Table* table, if (i < len) { auto it = table->find(keys[i]); if (it != table->end()) { - sgd.update_value((it.getter())->second, grads[i]); + sgd.update_value(optimizer_config, (it.getter())->second, grads[i]); } } } template __global__ void dy_mf_update_kernel(Table* table, + const OptimizerConfig& optimizer_config, const typename Table::key_type* const keys, const char* const grads, size_t len, Sgd sgd, size_t grad_value_size) { @@ -117,7 +119,7 @@ __global__ void dy_mf_update_kernel(Table* table, auto it = table->find(keys[i]); if (it != table->end()) { FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); - sgd.dy_mf_update_value((it.getter())->second, *cur); + sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); } else { printf("yxf::push miss key: %d", keys[i]); 
} @@ -127,6 +129,9 @@ __global__ void dy_mf_update_kernel(Table* table, template HashTable::HashTable(size_t capacity) { container_ = new TableContainer(capacity); + cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig)); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); rwlock_.reset(new phi::RWLock); } @@ -135,6 +140,22 @@ HashTable::~HashTable() { delete container_; } +template +void HashTable::set_sparse_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_sparse_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + +template +void HashTable::set_embedx_sgd( + const OptimizerConfig& optimizer_config) { + host_optimizer_config_.set_embedx_sgd(optimizer_config); + cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_, + sizeof(OptimizerConfig), cudaMemcpyHostToDevice); +} + template void HashTable::show() { container_->print(); @@ -279,8 +300,8 @@ void HashTable::update(const KeyType* d_keys, return; } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; - update_kernel<<>>(container_, d_keys, - d_grads, len, sgd); + update_kernel<<>>( + container_, *device_optimizer_config_, d_keys, d_grads, len, sgd); } template @@ -293,7 +314,8 @@ void HashTable::update(const KeyType* d_keys, } const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; dy_mf_update_kernel<<>>( - container_, d_keys, d_grads, len, sgd, push_grad_value_size_); + container_, *device_optimizer_config_, d_keys, d_grads, len, sgd, + push_grad_value_size_); } template class HashTable; diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps index cd43a73b44e..79c5f3d7577 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_kernel.kps @@ -163,7 +163,7 @@ __global__ void search_kernel(Table& table, const KeyType* const keys, } template -__global__ void update_kernel(OptimizerConfig& optimizer_config, Table& table, +__global__ void update_kernel(Table& table, OptimizerConfig& optimizer_config, const KeyType* const keys, const GradType* const grads, long long len) { int cid = core_id(); @@ -202,12 +202,9 @@ HashTable::HashTable(size_t capacity) { sizeof(XPUCacheArray)); xpu_memcpy((void*)container_, &tmp_container, sizeof(XPUCacheArray), XPU_HOST_TO_DEVICE); - - OptimizerConfig tmp_opt_config; - xpu_malloc(reinterpret_cast(&xpu_optimizer_config_), + xpu_malloc(reinterpret_cast(&device_optimizer_config_), sizeof(OptimizerConfig)); - - xpu_memcpy((void*)xpu_optimizer_config_, &tmp_opt_config, + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); rwlock_.reset(new phi::RWLock); @@ -216,7 +213,7 @@ HashTable::HashTable(size_t capacity) { template HashTable::~HashTable() { xpu_free((void*)container_); - xpu_free((void*)xpu_optimizer_config_); + xpu_free((void*)device_optimizer_config_); } template @@ -227,28 +224,16 @@ void HashTable::show() { template void HashTable::set_sparse_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.nonclk_coeff = optimizer_config.nonclk_coeff; - cpu_optimizer_config_.clk_coeff = optimizer_config.clk_coeff; - cpu_optimizer_config_.min_bound = optimizer_config.min_bound; - cpu_optimizer_config_.max_bound = optimizer_config.max_bound; - 
cpu_optimizer_config_.learning_rate = optimizer_config.learning_rate; - cpu_optimizer_config_.initial_g2sum = optimizer_config.initial_g2sum; - cpu_optimizer_config_.initial_range = optimizer_config.initial_range; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_sparse_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } template void HashTable::set_embedx_sgd( const OptimizerConfig& optimizer_config) { - cpu_optimizer_config_.mf_create_thresholds = - optimizer_config.mf_create_thresholds; - cpu_optimizer_config_.mf_learning_rate = optimizer_config.mf_learning_rate; - cpu_optimizer_config_.mf_initial_g2sum = optimizer_config.mf_initial_g2sum; - cpu_optimizer_config_.mf_initial_range = optimizer_config.mf_initial_range; - cpu_optimizer_config_.mf_min_bound = optimizer_config.mf_min_bound; - cpu_optimizer_config_.mf_max_bound = optimizer_config.mf_max_bound; - xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_, + host_optimizer_config_.set_embedx_sgd(optimizer_config); + xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_, sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE); } @@ -306,7 +291,7 @@ void HashTable::update(const KeyType* d_keys, long long c_len = (long long)len; update_kernel, GradType><<<4, 64, stream>>>( - *xpu_optimizer_config_, *container_, d_keys, d_grads, c_len); + *container_, *device_optimizer_config_, d_keys, d_grads, c_len); } template diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h index 6379f7ee912..e53a962c5ab 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm.h @@ -65,10 +65,8 @@ class HeterComm { void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len); #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config); void set_embedx_sgd(const OptimizerConfig& optimizer_config); -#endif int log2i(int x); diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h index 098adc2bdeb..2a4f535ef70 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h @@ -342,7 +342,6 @@ int HeterComm::get_index_by_devid(int devid) { return resource_->get_index_by_devid(devid); } -#if defined(PADDLE_WITH_XPU_KP) template void HeterComm::set_sparse_sgd( const OptimizerConfig& optimizer_config) { @@ -358,7 +357,6 @@ void HeterComm::set_embedx_sgd( table->set_embedx_sgd(optimizer_config); } } -#endif template void HeterComm::build_ps( diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu index 581b0d511c2..66e06b13b04 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.cu @@ -48,6 +48,14 @@ int HeterPs::get_index_by_devid(int devid) { return comm_->get_index_by_devid(devid); } +void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_sparse_sgd(optimizer_config); +} + +void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { + comm_->set_embedx_sgd(optimizer_config); +} + void HeterPs::end_pass() { comm_->end_pass(); } void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } diff --git 
a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h index 7060817be91..70b88350f27 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps.h @@ -44,10 +44,8 @@ class HeterPs : public HeterPsBase { int comm_size) override; #endif -#if defined(PADDLE_WITH_XPU_KP) void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; void set_embedx_sgd(const OptimizerConfig& optimizer_config) override; -#endif void end_pass() override; int get_index_by_devid(int devid) override; diff --git a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h index 79061ab66af..0727e2c2dbc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h +++ b/paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h @@ -16,9 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" #include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h" -#if defined(PADDLE_WITH_XPU_KP) #include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h" -#endif #ifdef PADDLE_WITH_HETERPS @@ -48,10 +46,8 @@ class HeterPsBase { virtual void push_sparse(int num, FeatureKey* d_keys, FeaturePushValue* d_grads, size_t len) = 0; -#if defined(PADDLE_WITH_XPU_KP) - virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) {} - virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) {} -#endif + virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; + virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; static HeterPsBase* get_instance(size_t capacity, std::shared_ptr resource); diff --git a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h index ebf7dd277c7..065d5e6d527 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer.cuh.h @@ -35,58 +35,64 @@ class Optimizer { void initialize() {} - __device__ void update_lr(float& w, float& g2sum, float g, // NOLINT + __device__ void update_lr(const OptimizerConfig& optimizer_config, + float& w, // NOLINT + float& g2sum, float g, // NOLINT float scale) { double add_g2sum = 0; - double ratio = optimizer_config::learning_rate * - sqrt(optimizer_config::initial_g2sum / - (optimizer_config::initial_g2sum + g2sum)); + double ratio = optimizer_config.learning_rate * + sqrt(optimizer_config.initial_g2sum / + (optimizer_config.initial_g2sum + g2sum)); double scaled_grad = g / scale; w += scaled_grad * ratio; - if (w < optimizer_config::min_bound) w = optimizer_config::min_bound; - if (w > optimizer_config::max_bound) w = optimizer_config::max_bound; + if (w < optimizer_config.min_bound) w = optimizer_config.min_bound; + if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; add_g2sum += scaled_grad * scaled_grad; g2sum += add_g2sum; } - __device__ void update_mf(int n, float* w, float& g2sum, // NOLINT + __device__ void update_mf(const OptimizerConfig& optimizer_config, int n, + float* w, + float& g2sum, // NOLINT const float* g, float scale) { double add_g2sum = 0; - double ratio = optimizer_config::mf_learning_rate * - sqrt(optimizer_config::mf_initial_g2sum / - (optimizer_config::mf_initial_g2sum + g2sum)); + double ratio = optimizer_config.mf_learning_rate * + sqrt(optimizer_config.mf_initial_g2sum / + (optimizer_config.mf_initial_g2sum + g2sum)); for (int i = 0; i < n; 
++i) { double scaled_grad = g[i] / scale; w[i] += scaled_grad * ratio; - if (w[i] < optimizer_config::mf_min_bound) - w[i] = optimizer_config::mf_min_bound; - if (w[i] > optimizer_config::mf_max_bound) - w[i] = optimizer_config::mf_max_bound; + if (w[i] < optimizer_config.mf_min_bound) + w[i] = optimizer_config.mf_min_bound; + if (w[i] > optimizer_config.mf_max_bound) + w[i] = optimizer_config.mf_max_bound; add_g2sum += scaled_grad * scaled_grad; } g2sum += add_g2sum / n; } - __device__ void update_value(ValType& val, const GradType& grad) { // NOLINT + __device__ void update_value(const OptimizerConfig& optimizer_config, + ValType& val, // NOLINT + const GradType& grad) { val.slot = grad.slot; val.show += grad.show; val.clk += grad.clk; - val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); if (val.mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (val.show - val.clk) + - optimizer_config::clk_coeff * val.clk) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (val.show - val.clk) + + optimizer_config.clk_coeff * val.clk) { val.mf_size = MF_DIM + 1; val.mf[0] = 0; int tid_x = blockIdx.x * blockDim.x + threadIdx.x; @@ -94,30 +100,31 @@ class Optimizer { curand_init(clock64(), tid_x, 0, &state); for (int i = 0; i < MF_DIM; ++i) { val.mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show); + update_mf(optimizer_config, MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, + grad.show); } } - __device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) { + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, + ValType* ptr, const GradType& grad) { ptr->slot = grad.slot; ptr->show += grad.show; ptr->clk += grad.clk; - ptr->delta_score += - optimizer_config::nonclk_coeff * (grad.show - grad.clk) + - optimizer_config::clk_coeff * grad.clk; + ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + + optimizer_config.clk_coeff * grad.clk; - update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); + update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); // use MF_DIM temporarily // ptr->mf_dim = grad.mf_dim; if (ptr->mf_size == 0) { - if (optimizer_config::mf_create_thresholds <= - optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) + - optimizer_config::clk_coeff * ptr->clk) { + if (optimizer_config.mf_create_thresholds <= + optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + + optimizer_config.clk_coeff * ptr->clk) { // ptr->mf_size = ptr->mf_dim + 1; ptr->mf_size = MF_DIM + 1; @@ -127,11 +134,11 @@ class Optimizer { curand_init(clock64(), tid_x, 0, &state); for (int i = 0; i < MF_DIM; ++i) { ptr->mf[i + 1] = - (curand_uniform(&state)) * optimizer_config::mf_initial_range; + (curand_uniform(&state)) * optimizer_config.mf_initial_range; } } } else { - update_mf(MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, + update_mf(optimizer_config, MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g, grad.show); // for local test } } diff --git 
a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h index 2a80aa4b52d..03caeb984f7 100644 --- a/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h +++ b/paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h @@ -14,50 +14,69 @@ limitations under the License. */ #pragma once -#if defined(PADDLE_WITH_CUDA) +namespace paddle { +namespace framework { -namespace optimizer_config { +class OptimizerConfig { + public: + float nonclk_coeff = 0.1; + float clk_coeff = 1; -__constant__ float nonclk_coeff = 0.1; -__constant__ float clk_coeff = 1; + float min_bound = -10; + float max_bound = 10; + float learning_rate = 0.05; + float initial_g2sum = 3.0; + float initial_range = 0; -__constant__ float min_bound = -10; -__constant__ float max_bound = 10; -__constant__ float learning_rate = 0.05; -__constant__ float initial_g2sum = 3.0; -__constant__ float initial_range = 0; + float mf_create_thresholds = 10; + float mf_learning_rate = 0.05; + float mf_initial_g2sum = 3.0; + float mf_initial_range = 1e-4; + float mf_min_bound = -10; + float mf_max_bound = 10; -__constant__ float mf_create_thresholds = 10; -__constant__ float mf_learning_rate = 0.05; -__constant__ float mf_initial_g2sum = 3.0; -__constant__ float mf_initial_range = 1e-4; -__constant__ float mf_min_bound = -10; -__constant__ float mf_max_bound = 10; -} // namespace optimizer_config + void set_sparse_sgd(float nonclk_coeff, float clk_coeff, float min_bound, + float max_bound, float learning_rate, float initial_g2sum, + float initial_range) { + this->nonclk_coeff = nonclk_coeff; + this->clk_coeff = clk_coeff; + this->min_bound = min_bound; + this->max_bound = max_bound; + this->learning_rate = learning_rate; + this->initial_g2sum = initial_g2sum; + this->initial_range = initial_range; + } -#elif defined(PADDLE_WITH_XPU_KP) -namespace paddle { -namespace framework { + void set_sparse_sgd(const OptimizerConfig& optimizer_config) { + this->nonclk_coeff = optimizer_config.nonclk_coeff; + this->clk_coeff = optimizer_config.clk_coeff; + this->min_bound = optimizer_config.min_bound; + this->max_bound = optimizer_config.max_bound; + this->learning_rate = optimizer_config.learning_rate; + this->initial_g2sum = optimizer_config.initial_g2sum; + this->initial_range = optimizer_config.initial_range; + } -class OptimizerConfig { - public: - float nonclk_coeff; - float clk_coeff; - - float min_bound; - float max_bound; - float learning_rate; - float initial_g2sum; - float initial_range; - - float mf_create_thresholds; - float mf_learning_rate; - float mf_initial_g2sum; - float mf_initial_range; - float mf_min_bound; - float mf_max_bound; + void set_embedx_sgd(float mf_create_thresholds, float mf_learning_rate, + float mf_initial_g2sum, float mf_initial_range, + float mf_min_bound, float mf_max_bound) { + this->mf_create_thresholds = mf_create_thresholds; + this->mf_learning_rate = mf_learning_rate; + this->mf_initial_g2sum = mf_initial_g2sum; + this->mf_initial_range = mf_initial_range; + this->mf_min_bound = mf_min_bound; + this->mf_max_bound = mf_max_bound; + } + + void set_embedx_sgd(const OptimizerConfig& optimizer_config) { + this->mf_create_thresholds = optimizer_config.mf_create_thresholds; + this->mf_learning_rate = optimizer_config.mf_learning_rate; + this->mf_initial_g2sum = optimizer_config.mf_initial_g2sum; + this->mf_initial_range = optimizer_config.mf_initial_range; + this->mf_min_bound = optimizer_config.mf_min_bound; + this->mf_max_bound = optimizer_config.mf_max_bound; + } }; 
+ } // namespace framework } // namespace paddle - -#endif diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index cf7d98db27e..3df5a4b4738 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -181,35 +181,21 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float min_bound, float max_bound, float learning_rate, float initial_g2sum, float initial_range) { - cudaMemcpyToSymbol(optimizer_config::nonclk_coeff, &nonclk_coeff, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::clk_coeff, &clk_coeff, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::min_bound, &min_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::max_bound, &max_bound, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::learning_rate, &learning_rate, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_g2sum, &initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::initial_range, &initial_range, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); + HeterPs_->set_sparse_sgd(optimizer_config); } void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_learning_rate, float mf_initial_g2sum, float mf_initial_range, float mf_min_bound, float mf_max_bound) { - cudaMemcpyToSymbol(optimizer_config::mf_create_thresholds, - &mf_create_thresholds, sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_learning_rate, &mf_learning_rate, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_g2sum, &mf_initial_g2sum, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_initial_range, &mf_initial_range, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_min_bound, &mf_min_bound, - sizeof(float)); - cudaMemcpyToSymbol(optimizer_config::mf_max_bound, &mf_max_bound, - sizeof(float)); + OptimizerConfig optimizer_config; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + mf_min_bound, mf_max_bound); + HeterPs_->set_embedx_sgd(optimizer_config); } } // end namespace framework diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index 571a090b9b4..28dd873a117 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -256,13 +256,8 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff, float learning_rate, float initial_g2sum, float initial_range) { OptimizerConfig optimizer_config; - optimizer_config.nonclk_coeff = nonclk_coeff; - optimizer_config.clk_coeff = clk_coeff; - optimizer_config.min_bound = min_bound; - optimizer_config.max_bound = max_bound; - optimizer_config.learning_rate = learning_rate; - optimizer_config.initial_g2sum = initial_g2sum; - optimizer_config.initial_range = initial_range; + optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, + learning_rate, initial_g2sum, initial_range); HeterPs_->set_sparse_sgd(optimizer_config); } @@ -271,12 +266,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, float mf_initial_range, float mf_min_bound, float mf_max_bound) { OptimizerConfig optimizer_config; - optimizer_config.mf_create_thresholds = mf_create_thresholds; - optimizer_config.mf_learning_rate = mf_learning_rate; - 
optimizer_config.mf_initial_g2sum = mf_initial_g2sum; - optimizer_config.mf_initial_range = mf_initial_range; - optimizer_config.mf_min_bound = mf_min_bound; - optimizer_config.mf_max_bound = mf_max_bound; + optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, + mf_initial_g2sum, mf_initial_range, + mf_min_bound, mf_max_bound); HeterPs_->set_embedx_sgd(optimizer_config); } diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 9b12870a2bb..aec40a5a7eb 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -95,8 +95,46 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } +void add_sparse_optimizer( + std::unordered_map& config, // NOLINT + const ::paddle::SparseCommonSGDRuleParameter& sgd_param, + const std::string& prefix = "") { + auto optimizer_name = sgd_param.name(); + if (optimizer_name == "naive") { + config[prefix + "learning_rate"] = sgd_param.naive().learning_rate(); + config[prefix + "initial_range"] = sgd_param.naive().initial_range(); + if (sgd_param.naive().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1]; + } + } else if (optimizer_name == "adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "std_adagrad") { + config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adagrad().initial_range(); + config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum(); + if (sgd_param.adagrad().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1]; + } + } else if (optimizer_name == "adam") { + config[prefix + "learning_rate"] = sgd_param.adam().learning_rate(); + config[prefix + "initial_range"] = sgd_param.adam().initial_range(); + if (sgd_param.adam().weight_bounds_size() == 2) { + config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0]; + config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1]; + } + } +} + void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { - // add for hbmps optimizer config + // optimizer config for hbmps auto fleet_desc_str = trainer_desc.fleet_desc(); google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); auto sparse_table = @@ -105,7 +143,7 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { auto sparse_table_accessor_parameter = sparse_table_accessor.downpour_accessor_param(); auto accessor_class = sparse_table_accessor.accessor_class(); - // gpups' sparse table optimizer config + // NOTE(zhangminxu): gpups' sparse table optimizer config, // now only support single sparse table // auto sparse_table = param_.sparse_table(0); std::unordered_map config; @@ -126,7 +164,14 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { config["max_bound"] = 
sparse_table_accessor.sparse_sgd_param().weight_bounds()[1]; } + // NOTE(zhangminxu): for DownpourCtrAccessor & DownpourCtrDoubleAccessor, + // optimizer config for embed_w & embedx_w is the same config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + config["mf_learning_rate"] = config["learning_rate"]; + config["mf_initial_g2sum"] = config["initial_g2sum"]; + config["mf_initial_range"] = config["initial_range"]; + config["mf_min_bound"] = config["min_bound"]; + config["mf_max_bound"] = config["max_bound"]; } else if (accessor_class == "DownpourSparseValueAccessor") { auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name(); if (optimizer_name == "naive") { @@ -186,71 +231,12 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { accessor_class == "DownpourDoubleUnitAccessor") { config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff(); config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff(); - auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name(); - if (optimizer_name == "naive") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().naive().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().naive().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .naive() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1]; - } - } else if (optimizer_name == "adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "std_adagrad") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_range(); - config["mf_initial_g2sum"] = - sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum(); - if (sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[0]; - config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param() - .adagrad() - .weight_bounds()[1]; - } - } else if (optimizer_name == "adam") { - config["mf_learning_rate"] = - sparse_table_accessor.embedx_sgd_param().adam().learning_rate(); - config["mf_initial_range"] = - sparse_table_accessor.embedx_sgd_param().adam().initial_range(); - if (sparse_table_accessor.embedx_sgd_param() - .adam() - .weight_bounds_size() == 2) { - config["mf_min_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0]; - config["mf_max_bound"] = - sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1]; - } - } config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold(); + // optimizer config for embed_w and embedx + 
add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param()); + add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(), + "mf_"); } - auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); ps_gpu_wrapper->InitializeGPUServer(config); } diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 2496d4d040e..b86b4fec8a5 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -37,7 +37,7 @@ limitations under the License. */ #include "paddle/phi/backends/dynload/port.h" #ifdef PADDLE_WITH_PSLIB -#include +#include "proto/ps.pb.h" #endif namespace paddle { -- GitLab
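
Note on the core refactor, for readers of this patch: the CUDA path previously read optimizer hyperparameters from file-scope __constant__ symbols in the optimizer_config namespace, while the XPU_KP path already used an OptimizerConfig object copied to device memory. This patch unifies both on the object approach: each HashTable keeps a host copy (host_optimizer_config_) and a device copy (device_optimizer_config_), re-syncs the device copy in set_sparse_sgd/set_embedx_sgd, and passes the device-side config into the update kernels. The following is a minimal, self-contained sketch of that pattern only; DemoTable, sgd_update_kernel, and the trimmed OptimizerConfig below are illustrative stand-ins, not the Paddle classes.

#include <cuda_runtime.h>

// Simplified stand-in for paddle::framework::OptimizerConfig; only the
// fields used below are modeled.
struct OptimizerConfig {
  float learning_rate = 0.05f;
  float min_bound = -10.0f;
  float max_bound = 10.0f;
};

// The kernel reads hyperparameters from a config object in device global
// memory instead of from file-scope __constant__ symbols.
__global__ void sgd_update_kernel(const OptimizerConfig* cfg, float* w,
                                  const float* g, size_t len) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    float v = w[i] - cfg->learning_rate * g[i];
    w[i] = fminf(fmaxf(v, cfg->min_bound), cfg->max_bound);
  }
}

// Hypothetical table class mirroring the host/device config pair the patch
// adds to HashTable (host_optimizer_config_ / device_optimizer_config_).
class DemoTable {
 public:
  DemoTable() {
    cudaMalloc(reinterpret_cast<void**>(&device_cfg_), sizeof(OptimizerConfig));
    sync();  // push the default host config to the device once at construction
  }
  ~DemoTable() { cudaFree(device_cfg_); }

  // Analogous to HashTable::set_sparse_sgd: mutate the host copy, then re-sync.
  void set_learning_rate(float lr) {
    host_cfg_.learning_rate = lr;
    sync();
  }

  // Analogous to HashTable::update: the device-resident config is handed to
  // the kernel along with the data.
  void update(float* d_w, const float* d_g, size_t len, cudaStream_t stream) {
    const int block = 256;
    const int grid = static_cast<int>((len + block - 1) / block);
    sgd_update_kernel<<<grid, block, 0, stream>>>(device_cfg_, d_w, d_g, len);
  }

 private:
  void sync() {
    cudaMemcpy(device_cfg_, &host_cfg_, sizeof(OptimizerConfig),
               cudaMemcpyHostToDevice);
  }
  OptimizerConfig host_cfg_;               // host-side source of truth
  OptimizerConfig* device_cfg_ = nullptr;  // device-resident copy
};

Compared with __constant__ globals, a struct in ordinary device memory can carry per-table settings and is the same mechanism the XPU_KP build already relied on, which is why the patch can drop the #if defined(PADDLE_WITH_XPU_KP) guards around set_sparse_sgd/set_embedx_sgd across hashtable.h, heter_comm.h, heter_ps.h and heter_ps_base.h.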
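The ps_gpu_trainer.cc change collapses four near-identical per-optimizer blocks into a single add_sparse_optimizer() helper that is called twice: once without a prefix for the embed_w settings and once with the "mf_" prefix for embedx_w. The snippet below illustrates that prefix trick in isolation and under stated assumptions: AdagradRule is a hypothetical stand-in for the adagrad branch of the ::paddle::SparseCommonSGDRuleParameter proto, modeling only the fields used here.

#include <string>
#include <unordered_map>

// Hypothetical stand-in for the adagrad branch of
// ::paddle::SparseCommonSGDRuleParameter; not a real Paddle type.
struct AdagradRule {
  float learning_rate = 0.05f;
  float initial_range = 1e-4f;
  float initial_g2sum = 3.0f;
};

// Same idea as add_sparse_optimizer(): the prefix decides whether the values
// land under the embed_w keys ("learning_rate", ...) or the embedx_w keys
// ("mf_learning_rate", ...).
void add_adagrad_config(std::unordered_map<std::string, float>& config,
                        const AdagradRule& rule,
                        const std::string& prefix = "") {
  config[prefix + "learning_rate"] = rule.learning_rate;
  config[prefix + "initial_range"] = rule.initial_range;
  config[prefix + "initial_g2sum"] = rule.initial_g2sum;
}

int main() {
  std::unordered_map<std::string, float> config;
  AdagradRule embed_rule;   // would come from embed_sgd_param() in the trainer
  AdagradRule embedx_rule;  // would come from embedx_sgd_param()
  add_adagrad_config(config, embed_rule);          // fills "learning_rate", ...
  add_adagrad_config(config, embedx_rule, "mf_");  // fills "mf_learning_rate", ...
  return 0;
}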