Unverified commit 0ce42fb0, authored by zmxdream, committed via GitHub

merge develop. test=develop (#42624)

Parent 21b35167
......@@ -41,9 +41,7 @@ limitations under the License. */
#include "xpu/kernel/simd.h"
#endif
#if defined(PADDLE_WITH_XPU_KP)
#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
#endif
namespace paddle {
namespace framework {
......@@ -132,10 +130,8 @@ class HashTable {
void show();
#if defined(PADDLE_WITH_XPU_KP)
void set_sparse_sgd(const OptimizerConfig& optimizer_config);
void set_embedx_sgd(const OptimizerConfig& optimizer_config);
#endif
template <typename StreamType>
void dump_to_cpu(int devid, StreamType stream);
......@@ -178,9 +174,10 @@ class HashTable {
TableContainer<KeyType, ValType>* container_;
#elif defined(PADDLE_WITH_XPU_KP)
XPUCacheArray<KeyType, ValType>* container_;
OptimizerConfig* xpu_optimizer_config_;
OptimizerConfig cpu_optimizer_config_;
#endif
OptimizerConfig* device_optimizer_config_;
OptimizerConfig host_optimizer_config_;
int BLOCK_SIZE_{256};
float LOAD_FACTOR{0.75f};
size_t capacity_;
......
......@@ -95,6 +95,7 @@ __global__ void dy_mf_search_kernel(Table* table,
template <typename Table, typename GradType, typename Sgd>
__global__ void update_kernel(Table* table,
const OptimizerConfig& optimizer_config,
const typename Table::key_type* const keys,
const GradType* const grads, size_t len,
Sgd sgd) {
......@@ -102,13 +103,14 @@ __global__ void update_kernel(Table* table,
if (i < len) {
auto it = table->find(keys[i]);
if (it != table->end()) {
sgd.update_value((it.getter())->second, grads[i]);
sgd.update_value(optimizer_config, (it.getter())->second, grads[i]);
}
}
}
template <typename Table, typename Sgd>
__global__ void dy_mf_update_kernel(Table* table,
const OptimizerConfig& optimizer_config,
const typename Table::key_type* const keys,
const char* const grads, size_t len,
Sgd sgd, size_t grad_value_size) {
......@@ -117,7 +119,7 @@ __global__ void dy_mf_update_kernel(Table* table,
auto it = table->find(keys[i]);
if (it != table->end()) {
FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size);
sgd.dy_mf_update_value((it.getter())->second, *cur);
sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur);
} else {
printf("yxf::push miss key: %d", keys[i]);
}
......@@ -127,6 +129,9 @@ __global__ void dy_mf_update_kernel(Table* table,
template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::HashTable(size_t capacity) {
container_ = new TableContainer<KeyType, ValType>(capacity);
cudaMalloc((void**)&device_optimizer_config_, sizeof(OptimizerConfig));
cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_,
sizeof(OptimizerConfig), cudaMemcpyHostToDevice);
rwlock_.reset(new phi::RWLock);
}
......@@ -135,6 +140,22 @@ HashTable<KeyType, ValType>::~HashTable() {
delete container_;
}
template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
host_optimizer_config_.set_sparse_sgd(optimizer_config);
cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_,
sizeof(OptimizerConfig), cudaMemcpyHostToDevice);
}
template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
host_optimizer_config_.set_embedx_sgd(optimizer_config);
cudaMemcpy((void*)device_optimizer_config_, &host_optimizer_config_,
sizeof(OptimizerConfig), cudaMemcpyHostToDevice);
}
template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::show() {
container_->print();
......@@ -279,8 +300,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
return;
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
update_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(container_, d_keys,
d_grads, len, sgd);
update_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(
container_, *device_optimizer_config_, d_keys, d_grads, len, sgd);
}
template <typename KeyType, typename ValType>
......@@ -293,7 +314,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
}
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
dy_mf_update_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(
container_, d_keys, d_grads, len, sgd, push_grad_value_size_);
container_, *device_optimizer_config_, d_keys, d_grads, len, sgd,
push_grad_value_size_);
}
template class HashTable<unsigned long, paddle::framework::FeatureValue>;
......
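Reviewer note: the CUDA hunks above replace the old `__constant__` globals with a per-table `OptimizerConfig` kept in device memory (`device_optimizer_config_`) and refreshed from the host copy (`host_optimizer_config_`) whenever `set_sparse_sgd` / `set_embedx_sgd` is called. A minimal, self-contained sketch of that host-to-device mirroring pattern — the struct and function names here are illustrative stand-ins, not the real Paddle types:

```cpp
// Sketch only: a trivially-copyable config mirrored to the device so kernels
// can read it directly, assuming a POD struct (safe for cudaMemcpy).
#include <cuda_runtime.h>

struct ToyConfig {           // stand-in for OptimizerConfig
  float learning_rate = 0.05f;
  float initial_g2sum = 3.0f;
};

__global__ void read_config(const ToyConfig* cfg, float* out) {
  *out = cfg->learning_rate;  // kernels read the device copy directly
}

int main() {
  ToyConfig host_cfg;               // analogue of host_optimizer_config_
  ToyConfig* device_cfg = nullptr;  // analogue of device_optimizer_config_
  cudaMalloc(&device_cfg, sizeof(ToyConfig));
  cudaMemcpy(device_cfg, &host_cfg, sizeof(ToyConfig), cudaMemcpyHostToDevice);

  float* d_out = nullptr;
  cudaMalloc(&d_out, sizeof(float));
  read_config<<<1, 1>>>(device_cfg, d_out);
  cudaDeviceSynchronize();

  cudaFree(d_out);
  cudaFree(device_cfg);
  return 0;
}
```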
......@@ -163,7 +163,7 @@ __global__ void search_kernel(Table& table, const KeyType* const keys,
}
template <typename KeyType, typename ValType, typename Table, typename GradType>
__global__ void update_kernel(OptimizerConfig& optimizer_config, Table& table,
__global__ void update_kernel(Table& table, OptimizerConfig& optimizer_config,
const KeyType* const keys,
const GradType* const grads, long long len) {
int cid = core_id();
......@@ -202,12 +202,9 @@ HashTable<KeyType, ValType>::HashTable(size_t capacity) {
sizeof(XPUCacheArray<KeyType, ValType>));
xpu_memcpy((void*)container_, &tmp_container,
sizeof(XPUCacheArray<KeyType, ValType>), XPU_HOST_TO_DEVICE);
OptimizerConfig tmp_opt_config;
xpu_malloc(reinterpret_cast<void**>(&xpu_optimizer_config_),
xpu_malloc(reinterpret_cast<void**>(&device_optimizer_config_),
sizeof(OptimizerConfig));
xpu_memcpy((void*)xpu_optimizer_config_, &tmp_opt_config,
xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_,
sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE);
rwlock_.reset(new phi::RWLock);
......@@ -216,7 +213,7 @@ HashTable<KeyType, ValType>::HashTable(size_t capacity) {
template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::~HashTable() {
xpu_free((void*)container_);
xpu_free((void*)xpu_optimizer_config_);
xpu_free((void*)device_optimizer_config_);
}
template <typename KeyType, typename ValType>
......@@ -227,28 +224,16 @@ void HashTable<KeyType, ValType>::show() {
template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
cpu_optimizer_config_.nonclk_coeff = optimizer_config.nonclk_coeff;
cpu_optimizer_config_.clk_coeff = optimizer_config.clk_coeff;
cpu_optimizer_config_.min_bound = optimizer_config.min_bound;
cpu_optimizer_config_.max_bound = optimizer_config.max_bound;
cpu_optimizer_config_.learning_rate = optimizer_config.learning_rate;
cpu_optimizer_config_.initial_g2sum = optimizer_config.initial_g2sum;
cpu_optimizer_config_.initial_range = optimizer_config.initial_range;
xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_,
host_optimizer_config_.set_sparse_sgd(optimizer_config);
xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_,
sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE);
}
template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
cpu_optimizer_config_.mf_create_thresholds =
optimizer_config.mf_create_thresholds;
cpu_optimizer_config_.mf_learning_rate = optimizer_config.mf_learning_rate;
cpu_optimizer_config_.mf_initial_g2sum = optimizer_config.mf_initial_g2sum;
cpu_optimizer_config_.mf_initial_range = optimizer_config.mf_initial_range;
cpu_optimizer_config_.mf_min_bound = optimizer_config.mf_min_bound;
cpu_optimizer_config_.mf_max_bound = optimizer_config.mf_max_bound;
xpu_memcpy((void*)xpu_optimizer_config_, &cpu_optimizer_config_,
host_optimizer_config_.set_embedx_sgd(optimizer_config);
xpu_memcpy((void*)device_optimizer_config_, &host_optimizer_config_,
sizeof(OptimizerConfig), XPU_HOST_TO_DEVICE);
}
......@@ -306,7 +291,7 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
long long c_len = (long long)len;
update_kernel<KeyType, ValType, XPUCacheArray<KeyType, ValType>,
GradType><<<4, 64, stream>>>(
*xpu_optimizer_config_, *container_, d_keys, d_grads, c_len);
*container_, *device_optimizer_config_, d_keys, d_grads, c_len);
}
template <typename KeyType, typename ValType>
......
......@@ -65,10 +65,8 @@ class HeterComm {
void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len);
#endif
#if defined(PADDLE_WITH_XPU_KP)
void set_sparse_sgd(const OptimizerConfig& optimizer_config);
void set_embedx_sgd(const OptimizerConfig& optimizer_config);
#endif
int log2i(int x);
......
......@@ -342,7 +342,6 @@ int HeterComm<KeyType, ValType, GradType>::get_index_by_devid(int devid) {
return resource_->get_index_by_devid(devid);
}
#if defined(PADDLE_WITH_XPU_KP)
template <typename KeyType, typename ValType, typename GradType>
void HeterComm<KeyType, ValType, GradType>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
......@@ -358,7 +357,6 @@ void HeterComm<KeyType, ValType, GradType>::set_embedx_sgd(
table->set_embedx_sgd(optimizer_config);
}
}
#endif
template <typename KeyType, typename ValType, typename GradType>
void HeterComm<KeyType, ValType, GradType>::build_ps(
......
......@@ -48,6 +48,14 @@ int HeterPs::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid);
}
void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) {
comm_->set_sparse_sgd(optimizer_config);
}
void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) {
comm_->set_embedx_sgd(optimizer_config);
}
void HeterPs::end_pass() { comm_->end_pass(); }
void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
......
......@@ -44,10 +44,8 @@ class HeterPs : public HeterPsBase {
int comm_size) override;
#endif
#if defined(PADDLE_WITH_XPU_KP)
void set_sparse_sgd(const OptimizerConfig& optimizer_config) override;
void set_embedx_sgd(const OptimizerConfig& optimizer_config) override;
#endif
void end_pass() override;
int get_index_by_devid(int devid) override;
......
......@@ -16,9 +16,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#if defined(PADDLE_WITH_XPU_KP)
#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"
#endif
#ifdef PADDLE_WITH_HETERPS
......@@ -48,10 +46,8 @@ class HeterPsBase {
virtual void push_sparse(int num, FeatureKey* d_keys,
FeaturePushValue* d_grads, size_t len) = 0;
#if defined(PADDLE_WITH_XPU_KP)
virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) {}
virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) {}
#endif
virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0;
virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0;
static HeterPsBase* get_instance(size_t capacity,
std::shared_ptr<HeterPsResource> resource);
......
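Reviewer note: with the `PADDLE_WITH_XPU_KP` guards removed, `set_sparse_sgd` / `set_embedx_sgd` become pure-virtual members of `HeterPsBase` on every backend, and `HeterPs` simply forwards them to the communicator. A hedged sketch of that forwarding shape, using toy class names rather than the real ones:

```cpp
// Illustrative stand-ins for HeterPsBase / HeterPs / HeterComm.
#include <memory>

struct Config { float learning_rate = 0.05f; };

class Base {
 public:
  virtual ~Base() = default;
  virtual void set_sparse_sgd(const Config& c) = 0;  // now pure virtual
};

class Comm {
 public:
  void set_sparse_sgd(const Config& c) { cfg_ = c; }  // pushes to each table
 private:
  Config cfg_;
};

class Impl : public Base {
 public:
  void set_sparse_sgd(const Config& c) override { comm_->set_sparse_sgd(c); }
 private:
  std::shared_ptr<Comm> comm_ = std::make_shared<Comm>();
};

int main() {
  std::unique_ptr<Base> ps = std::make_unique<Impl>();
  ps->set_sparse_sgd(Config{0.1f});
  return 0;
}
```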
......@@ -35,58 +35,64 @@ class Optimizer {
void initialize() {}
__device__ void update_lr(float& w, float& g2sum, float g, // NOLINT
__device__ void update_lr(const OptimizerConfig& optimizer_config,
float& w, // NOLINT
float& g2sum, float g, // NOLINT
float scale) {
double add_g2sum = 0;
double ratio = optimizer_config::learning_rate *
sqrt(optimizer_config::initial_g2sum /
(optimizer_config::initial_g2sum + g2sum));
double ratio = optimizer_config.learning_rate *
sqrt(optimizer_config.initial_g2sum /
(optimizer_config.initial_g2sum + g2sum));
double scaled_grad = g / scale;
w += scaled_grad * ratio;
if (w < optimizer_config::min_bound) w = optimizer_config::min_bound;
if (w > optimizer_config::max_bound) w = optimizer_config::max_bound;
if (w < optimizer_config.min_bound) w = optimizer_config.min_bound;
if (w > optimizer_config.max_bound) w = optimizer_config.max_bound;
add_g2sum += scaled_grad * scaled_grad;
g2sum += add_g2sum;
}
__device__ void update_mf(int n, float* w, float& g2sum, // NOLINT
__device__ void update_mf(const OptimizerConfig& optimizer_config, int n,
float* w,
float& g2sum, // NOLINT
const float* g, float scale) {
double add_g2sum = 0;
double ratio = optimizer_config::mf_learning_rate *
sqrt(optimizer_config::mf_initial_g2sum /
(optimizer_config::mf_initial_g2sum + g2sum));
double ratio = optimizer_config.mf_learning_rate *
sqrt(optimizer_config.mf_initial_g2sum /
(optimizer_config.mf_initial_g2sum + g2sum));
for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale;
w[i] += scaled_grad * ratio;
if (w[i] < optimizer_config::mf_min_bound)
w[i] = optimizer_config::mf_min_bound;
if (w[i] > optimizer_config::mf_max_bound)
w[i] = optimizer_config::mf_max_bound;
if (w[i] < optimizer_config.mf_min_bound)
w[i] = optimizer_config.mf_min_bound;
if (w[i] > optimizer_config.mf_max_bound)
w[i] = optimizer_config.mf_max_bound;
add_g2sum += scaled_grad * scaled_grad;
}
g2sum += add_g2sum / n;
}
__device__ void update_value(ValType& val, const GradType& grad) { // NOLINT
__device__ void update_value(const OptimizerConfig& optimizer_config,
ValType& val, // NOLINT
const GradType& grad) {
val.slot = grad.slot;
val.show += grad.show;
val.clk += grad.clk;
val.delta_score += optimizer_config::nonclk_coeff * (grad.show - grad.clk) +
optimizer_config::clk_coeff * grad.clk;
val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) +
optimizer_config.clk_coeff * grad.clk;
update_lr(val.lr, val.lr_g2sum, grad.lr_g, grad.show);
update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show);
if (val.mf_size == 0) {
if (optimizer_config::mf_create_thresholds <=
optimizer_config::nonclk_coeff * (val.show - val.clk) +
optimizer_config::clk_coeff * val.clk) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * (val.show - val.clk) +
optimizer_config.clk_coeff * val.clk) {
val.mf_size = MF_DIM + 1;
val.mf[0] = 0;
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -94,30 +100,31 @@ class Optimizer {
curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < MF_DIM; ++i) {
val.mf[i + 1] =
(curand_uniform(&state)) * optimizer_config::mf_initial_range;
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
}
}
} else {
update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, grad.show);
update_mf(optimizer_config, MF_DIM, &val.mf[1], val.mf[0], grad.mf_g,
grad.show);
}
}
__device__ void dy_mf_update_value(ValType* ptr, const GradType& grad) {
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
ValType* ptr, const GradType& grad) {
ptr->slot = grad.slot;
ptr->show += grad.show;
ptr->clk += grad.clk;
ptr->delta_score +=
optimizer_config::nonclk_coeff * (grad.show - grad.clk) +
optimizer_config::clk_coeff * grad.clk;
ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) +
optimizer_config.clk_coeff * grad.clk;
update_lr(ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show);
update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show);
// use MF_DIM temporarily
// ptr->mf_dim = grad.mf_dim;
if (ptr->mf_size == 0) {
if (optimizer_config::mf_create_thresholds <=
optimizer_config::nonclk_coeff * (ptr->show - ptr->clk) +
optimizer_config::clk_coeff * ptr->clk) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) +
optimizer_config.clk_coeff * ptr->clk) {
// ptr->mf_size = ptr->mf_dim + 1;
ptr->mf_size = MF_DIM + 1;
......@@ -127,11 +134,11 @@ class Optimizer {
curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < MF_DIM; ++i) {
ptr->mf[i + 1] =
(curand_uniform(&state)) * optimizer_config::mf_initial_range;
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
}
}
} else {
update_mf(MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g,
update_mf(optimizer_config, MF_DIM, &(ptr->mf[1]), ptr->mf[0], grad.mf_g,
grad.show); // for local test
}
}
......
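Reviewer note: the optimizer now reads its hyperparameters from the `OptimizerConfig` argument instead of the former `optimizer_config::` globals. A small host-side sketch of the same adagrad-style `update_lr` rule, assuming only the field names visible in the hunk above:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

struct Config {                 // subset of OptimizerConfig used by update_lr
  float learning_rate = 0.05f;
  float initial_g2sum = 3.0f;
  float min_bound = -10.0f;
  float max_bound = 10.0f;
};

// Same update rule as Optimizer::update_lr, written as a free host function.
void update_lr(const Config& cfg, float& w, float& g2sum, float g, float scale) {
  double ratio = cfg.learning_rate *
                 std::sqrt(cfg.initial_g2sum / (cfg.initial_g2sum + g2sum));
  double scaled_grad = g / scale;
  w += static_cast<float>(scaled_grad * ratio);
  w = std::min(std::max(w, cfg.min_bound), cfg.max_bound);
  g2sum += static_cast<float>(scaled_grad * scaled_grad);
}

int main() {
  Config cfg;
  float w = 0.0f, g2sum = 0.0f;
  update_lr(cfg, w, g2sum, /*g=*/1.0f, /*scale=*/1.0f);
  std::printf("w=%f g2sum=%f\n", w, g2sum);
  return 0;
}
```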
......@@ -14,50 +14,69 @@ limitations under the License. */
#pragma once
#if defined(PADDLE_WITH_CUDA)
namespace paddle {
namespace framework {
namespace optimizer_config {
class OptimizerConfig {
public:
float nonclk_coeff = 0.1;
float clk_coeff = 1;
__constant__ float nonclk_coeff = 0.1;
__constant__ float clk_coeff = 1;
float min_bound = -10;
float max_bound = 10;
float learning_rate = 0.05;
float initial_g2sum = 3.0;
float initial_range = 0;
__constant__ float min_bound = -10;
__constant__ float max_bound = 10;
__constant__ float learning_rate = 0.05;
__constant__ float initial_g2sum = 3.0;
__constant__ float initial_range = 0;
float mf_create_thresholds = 10;
float mf_learning_rate = 0.05;
float mf_initial_g2sum = 3.0;
float mf_initial_range = 1e-4;
float mf_min_bound = -10;
float mf_max_bound = 10;
__constant__ float mf_create_thresholds = 10;
__constant__ float mf_learning_rate = 0.05;
__constant__ float mf_initial_g2sum = 3.0;
__constant__ float mf_initial_range = 1e-4;
__constant__ float mf_min_bound = -10;
__constant__ float mf_max_bound = 10;
} // namespace optimizer_config
void set_sparse_sgd(float nonclk_coeff, float clk_coeff, float min_bound,
float max_bound, float learning_rate, float initial_g2sum,
float initial_range) {
this->nonclk_coeff = nonclk_coeff;
this->clk_coeff = clk_coeff;
this->min_bound = min_bound;
this->max_bound = max_bound;
this->learning_rate = learning_rate;
this->initial_g2sum = initial_g2sum;
this->initial_range = initial_range;
}
#elif defined(PADDLE_WITH_XPU_KP)
namespace paddle {
namespace framework {
void set_sparse_sgd(const OptimizerConfig& optimizer_config) {
this->nonclk_coeff = optimizer_config.nonclk_coeff;
this->clk_coeff = optimizer_config.clk_coeff;
this->min_bound = optimizer_config.min_bound;
this->max_bound = optimizer_config.max_bound;
this->learning_rate = optimizer_config.learning_rate;
this->initial_g2sum = optimizer_config.initial_g2sum;
this->initial_range = optimizer_config.initial_range;
}
class OptimizerConfig {
public:
float nonclk_coeff;
float clk_coeff;
float min_bound;
float max_bound;
float learning_rate;
float initial_g2sum;
float initial_range;
float mf_create_thresholds;
float mf_learning_rate;
float mf_initial_g2sum;
float mf_initial_range;
float mf_min_bound;
float mf_max_bound;
void set_embedx_sgd(float mf_create_thresholds, float mf_learning_rate,
float mf_initial_g2sum, float mf_initial_range,
float mf_min_bound, float mf_max_bound) {
this->mf_create_thresholds = mf_create_thresholds;
this->mf_learning_rate = mf_learning_rate;
this->mf_initial_g2sum = mf_initial_g2sum;
this->mf_initial_range = mf_initial_range;
this->mf_min_bound = mf_min_bound;
this->mf_max_bound = mf_max_bound;
}
void set_embedx_sgd(const OptimizerConfig& optimizer_config) {
this->mf_create_thresholds = optimizer_config.mf_create_thresholds;
this->mf_learning_rate = optimizer_config.mf_learning_rate;
this->mf_initial_g2sum = optimizer_config.mf_initial_g2sum;
this->mf_initial_range = optimizer_config.mf_initial_range;
this->mf_min_bound = optimizer_config.mf_min_bound;
this->mf_max_bound = optimizer_config.mf_max_bound;
}
};
} // namespace framework
} // namespace paddle
#endif
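Reviewer note: after this change both builds share one `OptimizerConfig` class; the CUDA path fills it from individual floats, the XPU path from another config object. A trimmed, self-compiling sketch of how the wrapper now builds the object (the class body here is a stand-in copied from the defaults above, and the commented call at the end is where the real code hands it to the PS):

```cpp
// Sketch: building an OptimizerConfig the way PSGPUWrapper::SetSparseSGD /
// SetEmbedxSGD now do. OptimizerConfigSketch is an illustrative stand-in.
struct OptimizerConfigSketch {
  float nonclk_coeff = 0.1f, clk_coeff = 1.0f;
  float min_bound = -10.0f, max_bound = 10.0f;
  float learning_rate = 0.05f, initial_g2sum = 3.0f, initial_range = 0.0f;
  float mf_create_thresholds = 10.0f, mf_learning_rate = 0.05f;
  float mf_initial_g2sum = 3.0f, mf_initial_range = 1e-4f;
  float mf_min_bound = -10.0f, mf_max_bound = 10.0f;

  void set_sparse_sgd(float nonclk, float clk, float min_b, float max_b,
                      float lr, float g2sum, float range) {
    nonclk_coeff = nonclk; clk_coeff = clk;
    min_bound = min_b; max_bound = max_b;
    learning_rate = lr; initial_g2sum = g2sum; initial_range = range;
  }
  void set_embedx_sgd(float create, float lr, float g2sum, float range,
                      float min_b, float max_b) {
    mf_create_thresholds = create; mf_learning_rate = lr;
    mf_initial_g2sum = g2sum; mf_initial_range = range;
    mf_min_bound = min_b; mf_max_bound = max_b;
  }
};

int main() {
  OptimizerConfigSketch cfg;
  cfg.set_sparse_sgd(0.1f, 1.0f, -10.0f, 10.0f, 0.05f, 3.0f, 0.0f);
  cfg.set_embedx_sgd(10.0f, 0.05f, 3.0f, 1e-4f, -10.0f, 10.0f);
  // In the real code: HeterPs_->set_sparse_sgd(cfg); HeterPs_->set_embedx_sgd(cfg);
  return 0;
}
```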
......@@ -181,35 +181,21 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff,
float min_bound, float max_bound,
float learning_rate, float initial_g2sum,
float initial_range) {
cudaMemcpyToSymbol(optimizer_config::nonclk_coeff, &nonclk_coeff,
sizeof(float));
cudaMemcpyToSymbol(optimizer_config::clk_coeff, &clk_coeff, sizeof(float));
cudaMemcpyToSymbol(optimizer_config::min_bound, &min_bound, sizeof(float));
cudaMemcpyToSymbol(optimizer_config::max_bound, &max_bound, sizeof(float));
cudaMemcpyToSymbol(optimizer_config::learning_rate, &learning_rate,
sizeof(float));
cudaMemcpyToSymbol(optimizer_config::initial_g2sum, &initial_g2sum,
sizeof(float));
cudaMemcpyToSymbol(optimizer_config::initial_range, &initial_range,
sizeof(float));
OptimizerConfig optimizer_config;
optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound,
learning_rate, initial_g2sum, initial_range);
HeterPs_->set_sparse_sgd(optimizer_config);
}
void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_learning_rate, float mf_initial_g2sum,
float mf_initial_range, float mf_min_bound,
float mf_max_bound) {
cudaMemcpyToSymbol(optimizer_config::mf_create_thresholds,
&mf_create_thresholds, sizeof(float));
cudaMemcpyToSymbol(optimizer_config::mf_learning_rate, &mf_learning_rate,
sizeof(float));
cudaMemcpyToSymbol(optimizer_config::mf_initial_g2sum, &mf_initial_g2sum,
sizeof(float));
cudaMemcpyToSymbol(optimizer_config::mf_initial_range, &mf_initial_range,
sizeof(float));
cudaMemcpyToSymbol(optimizer_config::mf_min_bound, &mf_min_bound,
sizeof(float));
cudaMemcpyToSymbol(optimizer_config::mf_max_bound, &mf_max_bound,
sizeof(float));
OptimizerConfig optimizer_config;
optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
mf_initial_g2sum, mf_initial_range,
mf_min_bound, mf_max_bound);
HeterPs_->set_embedx_sgd(optimizer_config);
}
} // end namespace framework
......
......@@ -256,13 +256,8 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff,
float learning_rate, float initial_g2sum,
float initial_range) {
OptimizerConfig optimizer_config;
optimizer_config.nonclk_coeff = nonclk_coeff;
optimizer_config.clk_coeff = clk_coeff;
optimizer_config.min_bound = min_bound;
optimizer_config.max_bound = max_bound;
optimizer_config.learning_rate = learning_rate;
optimizer_config.initial_g2sum = initial_g2sum;
optimizer_config.initial_range = initial_range;
optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound,
learning_rate, initial_g2sum, initial_range);
HeterPs_->set_sparse_sgd(optimizer_config);
}
......@@ -271,12 +266,9 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_initial_range, float mf_min_bound,
float mf_max_bound) {
OptimizerConfig optimizer_config;
optimizer_config.mf_create_thresholds = mf_create_thresholds;
optimizer_config.mf_learning_rate = mf_learning_rate;
optimizer_config.mf_initial_g2sum = mf_initial_g2sum;
optimizer_config.mf_initial_range = mf_initial_range;
optimizer_config.mf_min_bound = mf_min_bound;
optimizer_config.mf_max_bound = mf_max_bound;
optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
mf_initial_g2sum, mf_initial_range,
mf_min_bound, mf_max_bound);
HeterPs_->set_embedx_sgd(optimizer_config);
}
......
......@@ -95,8 +95,46 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
return;
}
void add_sparse_optimizer(
std::unordered_map<std::string, float>& config, // NOLINT
const ::paddle::SparseCommonSGDRuleParameter& sgd_param,
const std::string& prefix = "") {
auto optimizer_name = sgd_param.name();
if (optimizer_name == "naive") {
config[prefix + "learning_rate"] = sgd_param.naive().learning_rate();
config[prefix + "initial_range"] = sgd_param.naive().initial_range();
if (sgd_param.naive().weight_bounds_size() == 2) {
config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1];
}
} else if (optimizer_name == "adagrad") {
config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
if (sgd_param.adagrad().weight_bounds_size() == 2) {
config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
}
} else if (optimizer_name == "std_adagrad") {
config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
if (sgd_param.adagrad().weight_bounds_size() == 2) {
config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
}
} else if (optimizer_name == "adam") {
config[prefix + "learning_rate"] = sgd_param.adam().learning_rate();
config[prefix + "initial_range"] = sgd_param.adam().initial_range();
if (sgd_param.adam().weight_bounds_size() == 2) {
config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1];
}
}
}
void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) {
// add for hbmps optimizer config
// optimizer config for hbmps
auto fleet_desc_str = trainer_desc.fleet_desc();
google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param);
auto sparse_table =
......@@ -105,7 +143,7 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) {
auto sparse_table_accessor_parameter =
sparse_table_accessor.downpour_accessor_param();
auto accessor_class = sparse_table_accessor.accessor_class();
// gpups' sparse table optimizer config
// NOTE(zhangminxu): gpups' sparse table optimizer config,
// now only support single sparse table
// auto sparse_table = param_.sparse_table(0);
std::unordered_map<std::string, float> config;
......@@ -126,7 +164,14 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) {
config["max_bound"] =
sparse_table_accessor.sparse_sgd_param().weight_bounds()[1];
}
// NOTE(zhangminxu): for DownpourCtrAccessor & DownpourCtrDoubleAccessor,
// optimizer config for embed_w & embedx_w is the same
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
config["mf_learning_rate"] = config["learning_rate"];
config["mf_initial_g2sum"] = config["initial_g2sum"];
config["mf_initial_range"] = config["initial_range"];
config["mf_min_bound"] = config["min_bound"];
config["mf_max_bound"] = config["max_bound"];
} else if (accessor_class == "DownpourSparseValueAccessor") {
auto optimizer_name = sparse_table_accessor.sparse_commonsgd_param().name();
if (optimizer_name == "naive") {
......@@ -186,71 +231,12 @@ void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) {
accessor_class == "DownpourDoubleUnitAccessor") {
config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
auto optimizer_name = sparse_table_accessor.embedx_sgd_param().name();
if (optimizer_name == "naive") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().naive().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().naive().initial_range();
if (sparse_table_accessor.embedx_sgd_param()
.naive()
.weight_bounds_size() == 2) {
config["mf_min_bound"] =
sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[0];
config["mf_max_bound"] =
sparse_table_accessor.embedx_sgd_param().naive().weight_bounds()[1];
}
} else if (optimizer_name == "adagrad") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
config["mf_initial_g2sum"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
if (sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[0];
config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "std_adagrad") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adagrad().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_range();
config["mf_initial_g2sum"] =
sparse_table_accessor.embedx_sgd_param().adagrad().initial_g2sum();
if (sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds_size() == 2) {
config["mf_min_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[0];
config["mf_max_bound"] = sparse_table_accessor.embedx_sgd_param()
.adagrad()
.weight_bounds()[1];
}
} else if (optimizer_name == "adam") {
config["mf_learning_rate"] =
sparse_table_accessor.embedx_sgd_param().adam().learning_rate();
config["mf_initial_range"] =
sparse_table_accessor.embedx_sgd_param().adam().initial_range();
if (sparse_table_accessor.embedx_sgd_param()
.adam()
.weight_bounds_size() == 2) {
config["mf_min_bound"] =
sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[0];
config["mf_max_bound"] =
sparse_table_accessor.embedx_sgd_param().adam().weight_bounds()[1];
}
}
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
// optimizer config for embed_w and embedx
add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
add_sparse_optimizer(config, sparse_table_accessor.embedx_sgd_param(),
"mf_");
}
auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance();
ps_gpu_wrapper->InitializeGPUServer(config);
}
......
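Reviewer note: the new `add_sparse_optimizer` helper folds the repeated per-optimizer parsing into one function that writes `learning_rate`, `initial_range`, and the bounds into the flat config map, with an optional `mf_` prefix for the embedx table. A compact sketch of the same pattern, using a hypothetical parameter holder in place of the protobuf `SparseCommonSGDRuleParameter`:

```cpp
#include <string>
#include <unordered_map>
#include <vector>

// Hypothetical stand-in for the protobuf SparseCommonSGDRuleParameter.
struct SgdRule {
  std::string name;                  // "naive", "adagrad", "std_adagrad", "adam"
  float learning_rate = 0.05f;
  float initial_range = 1e-4f;
  float initial_g2sum = 3.0f;
  std::vector<float> weight_bounds;  // optional [min, max]
};

void add_sparse_optimizer(std::unordered_map<std::string, float>& config,
                          const SgdRule& sgd, const std::string& prefix = "") {
  config[prefix + "learning_rate"] = sgd.learning_rate;
  config[prefix + "initial_range"] = sgd.initial_range;
  if (sgd.name == "adagrad" || sgd.name == "std_adagrad") {
    config[prefix + "initial_g2sum"] = sgd.initial_g2sum;
  }
  if (sgd.weight_bounds.size() == 2) {
    config[prefix + "min_bound"] = sgd.weight_bounds[0];
    config[prefix + "max_bound"] = sgd.weight_bounds[1];
  }
}

int main() {
  std::unordered_map<std::string, float> config;
  add_sparse_optimizer(config, SgdRule{"adagrad"});         // embed_w
  add_sparse_optimizer(config, SgdRule{"adagrad"}, "mf_");  // embedx_w
  return 0;
}
```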
......@@ -37,7 +37,7 @@ limitations under the License. */
#include "paddle/phi/backends/dynload/port.h"
#ifdef PADDLE_WITH_PSLIB
#include <pslib.h>
#include "proto/ps.pb.h"
#endif
namespace paddle {
......