Unverified commit b8d106e1, authored by danleifeng, committed by GitHub

【GPUPS】Adam accessor (#43919)

* Add Adam/SharedAdam optimizers for GPUPS; rework the optimizer struct; test=develop

Parent 1882ffd5
@@ -31,6 +31,7 @@ int CtrDymfAccessor::Initialize() {
   _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
   _embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(),
                                _config.embedx_dim());
+  common_feature_value.optimizer_name = name;

   common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim();
   common_feature_value.embedx_dim = _config.embedx_dim();
@@ -42,7 +43,10 @@ int CtrDymfAccessor::Initialize() {
   if (_config.ctr_accessor_param().show_scale()) {
     _show_scale = true;
   }
-  VLOG(0) << " INTO CtrDymfAccessor::Initialize()";
+  VLOG(0) << " INTO CtrDymfAccessor::Initialize(); embed_sgd_dim:"
+          << common_feature_value.embed_sgd_dim
+          << " embedx_dim:" << common_feature_value.embedx_dim
+          << " embedx_sgd_dim:" << common_feature_value.embedx_sgd_dim;
   InitAccessorInfo();
   return 0;
 }
@@ -53,9 +57,9 @@ void CtrDymfAccessor::InitAccessorInfo() {
   auto embedx_dim = _config.embedx_dim();
   VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim;
-  _accessor_info.select_dim = 3 + embedx_dim;
+  _accessor_info.select_dim = 4 + embedx_dim;
   _accessor_info.select_size = _accessor_info.select_dim * sizeof(float);
-  _accessor_info.update_dim = 4 + embedx_dim;
+  _accessor_info.update_dim = 5 + embedx_dim;
   _accessor_info.update_size = _accessor_info.update_dim * sizeof(float);
   _accessor_info.mf_size =
       (embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float);
@@ -179,8 +183,10 @@ int32_t CtrDymfAccessor::Create(float** values, size_t num) {
     value[common_feature_value.ClickIndex()] = 0;
     value[common_feature_value.SlotIndex()] = -1;
     value[common_feature_value.MfDimIndex()] = -1;
-    _embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(),
-                               value + common_feature_value.EmbedG2SumIndex());
+    _embed_sgd_rule->InitValue(
+        value + common_feature_value.EmbedWIndex(),
+        value + common_feature_value.EmbedG2SumIndex(),
+        false);  // adam embed init not zero, adagrad embed init zero
     _embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(),
                                 value + common_feature_value.EmbedxG2SumIndex(),
                                 false);
@@ -293,22 +299,14 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) {
        i++) {
     os << " " << v[i];
   }
-  // os << " " << common_feature_value.Slot(const_cast<float*>(v)) << " "
-  //    << common_feature_value.MfDim(const_cast<float*>(v));
   auto show = common_feature_value.Show(const_cast<float*>(v));
   auto click = common_feature_value.Click(const_cast<float*>(v));
   auto score = ShowClickScore(show, click);
+  auto mf_dim = int(common_feature_value.MfDim(const_cast<float*>(v)));
   if (score >= _config.embedx_threshold() &&
       param > common_feature_value.EmbedxG2SumIndex()) {
-    // VLOG(1) << "common_feature_value.EmbedxG2SumIndex():"
-    //         << common_feature_value.EmbedxG2SumIndex();
-    // VLOG(1) << "common_feature_value.EmbedxWIndex():"
-    //         << common_feature_value.EmbedxWIndex();
-    // VLOG(1) << "common_feature_value.MfDim():"
-    //         << common_feature_value.MfDim(const_cast<float*>(v));
     for (auto i = common_feature_value.EmbedxG2SumIndex();
-         i < common_feature_value.EmbedxWIndex() +
-                 common_feature_value.MfDim(const_cast<float*>(v));
+         i < common_feature_value.Dim(mf_dim);
          ++i) {
       os << " " << v[i];
     }
......
@@ -54,10 +54,24 @@ class CtrDymfAccessor : public ValueAccessor {
     int ClickIndex() { return ShowIndex() + 1; }
     int EmbedWIndex() { return ClickIndex() + 1; }
     int EmbedG2SumIndex() { return EmbedWIndex() + 1; }
-    int SlotIndex() { return EmbedG2SumIndex() + 1; }
+    int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; }
     int MfDimIndex() { return SlotIndex() + 1; }
     int EmbedxG2SumIndex() { return MfDimIndex() + 1; }
-    int EmbedxWIndex() { return EmbedxG2SumIndex() + 1; }
+    int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; }
+
+    // total value length derived from mf_dim
+    int Dim(int& mf_dim) {
+      int tmp_embedx_sgd_dim = 1;
+      if (optimizer_name == "SparseAdamSGDRule") {  // adam
+        tmp_embedx_sgd_dim = mf_dim * 2 + 2;
+      } else if (optimizer_name == "SparseSharedAdamSGDRule") {  // shared_adam
+        tmp_embedx_sgd_dim = 4;
+      }
+      return 7 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim;
+    }
+
+    // total byte size derived from mf_dim
+    int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); }

     float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; }
     float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; }
@@ -73,6 +87,7 @@ class CtrDymfAccessor : public ValueAccessor {
     int embed_sgd_dim;
     int embedx_dim;
     int embedx_sgd_dim;
+    std::string optimizer_name;
   };

   struct CtrDymfPushValue {
......
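Note on the layout change above: with the SGD state inlined into the feature value, SlotIndex and EmbedxWIndex now skip embed_sgd_dim and embedx_sgd_dim floats instead of one each. Below is a standalone sketch of the resulting offsets; the index chain mirrors the header above, while the UnseenDays/DeltaScore positions and the Adam state sizes are assumptions inferred from the getters shown, not copied from Paddle.

// layout_sketch.cc -- illustrative only, mirrors CtrDymfAccessor's index chain.
#include <cstdio>

struct LayoutSketch {
  int embed_sgd_dim;   // assumed 4 for Adam: gsum, g2sum, beta1_pow, beta2_pow
  int embedx_sgd_dim;  // assumed mf_dim * 2 + 2 for Adam, 4 for shared Adam
  int UnseenDaysIndex() { return 0; }
  int DeltaScoreIndex() { return UnseenDaysIndex() + 1; }
  int ShowIndex() { return DeltaScoreIndex() + 1; }
  int ClickIndex() { return ShowIndex() + 1; }
  int EmbedWIndex() { return ClickIndex() + 1; }
  int EmbedG2SumIndex() { return EmbedWIndex() + 1; }
  int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; }
  int MfDimIndex() { return SlotIndex() + 1; }
  int EmbedxG2SumIndex() { return MfDimIndex() + 1; }
  int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; }
  // 7 fixed floats + embed state + embedx state + embedx weights
  int Dim(int mf_dim) { return 7 + embed_sgd_dim + embedx_sgd_dim + mf_dim; }
};

int main() {
  LayoutSketch v{4, 8 * 2 + 2};  // Adam, mf_dim = 8
  std::printf("slot@%d mf_dim@%d embedx_w@%d total=%d\n",
              v.SlotIndex(), v.MfDimIndex(), v.EmbedxWIndex(), v.Dim(8));
  // prints: slot@9 mf_dim@10 embedx_w@29 total=37
  return 0;
}

For Adam with mf_dim = 8 this gives Dim() = 7 + 4 + 18 + 8 = 37, which is consistent with the 7 fixed fields in the Dim() added to the header above.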
@@ -213,7 +213,6 @@ void SparseAdamSGDRule::UpdateValueWork(float* w,
   float beta1_pow_ = *beta1_pow;
   float beta2_pow_ = *beta2_pow;
-  // lr not change in one update
   lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
   for (size_t i = 0; i < _embedding_dim; i++) {
     // Calculation
@@ -252,5 +251,88 @@ void SparseAdamSGDRule::InitValueWork(float* value,
   *(sgd + Beta1PowIndex()) = _beta1_decay_rate;
   *(sgd + Beta2PowIndex()) = _beta2_decay_rate;
 }
+
+void SparseSharedAdamSGDRule::LoadConfig(
+    const SparseCommonSGDRuleParameter& param, size_t emb_dim) {
+  _embedding_dim = emb_dim;
+  auto adam_param = param.adam();
+  learning_rate_ = adam_param.learning_rate();
+  _initial_range = adam_param.initial_range();
+  _beta1_decay_rate = adam_param.beta1_decay_rate();
+  _beta2_decay_rate = adam_param.beta2_decay_rate();
+  _ada_epsilon = adam_param.ada_epsilon();
+  if (adam_param.weight_bounds_size() == 0) {
+    _min_bound = -std::numeric_limits<float>::max();
+    _max_bound = std::numeric_limits<float>::max();
+  } else {
+    CHECK(adam_param.weight_bounds_size() >= 2)
+        << "invalid repeated size for weight_bounds:"
+        << adam_param.weight_bounds_size();
+    _min_bound = adam_param.weight_bounds(0);
+    _max_bound = adam_param.weight_bounds(1);
+  }
+}
+
+void SparseSharedAdamSGDRule::UpdateValueWork(float* w,
+                                              float* sgd,
+                                              const float* grad,
+                                              float scale) {
+  float* gsum = sgd + GSumIndex();
+  float* g2sum = sgd + G2SumIndex();
+  float* beta1_pow = sgd + Beta1PowIndex();
+  float* beta2_pow = sgd + Beta2PowIndex();
+  const float* g = grad;
+  float lr = learning_rate_;
+  float beta1_pow_ = *beta1_pow;
+  float beta2_pow_ = *beta2_pow;
+  float gsum_ = *gsum;
+  float g2sum_ = *g2sum;
+  lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
+  double sum_gsum = 0.0;
+  double sum_g2sum = 0.0;
+  for (int i = 0; i < _embedding_dim; i++) {
+    // Calculation
+    double new_gsum =
+        _beta1_decay_rate * gsum_ + (1 - _beta1_decay_rate) * g[i];
+    double new_g2sum =
+        _beta2_decay_rate * g2sum_ + (1 - _beta2_decay_rate) * g[i] * g[i];
+    w[i] = w[i] - lr * (new_gsum / (sqrt(new_g2sum) + _ada_epsilon));
+    BoundValue(w[i]);
+    sum_gsum += new_gsum;
+    sum_g2sum += new_g2sum;
+  }
+  // update beta_pow_decay
+  (*gsum) = sum_gsum / _embedding_dim;
+  (*g2sum) = sum_g2sum / _embedding_dim;
+  (*beta1_pow) *= _beta1_decay_rate;
+  (*beta2_pow) *= _beta2_decay_rate;
+}
+
+void SparseSharedAdamSGDRule::InitValueWork(float* value,
+                                            float* sgd,
+                                            bool zero_init) {
+  for (int i = 0; i < _embedding_dim; ++i) {
+    if (zero_init) {
+      value[i] = 0.0;
+      BoundValue(value[i]);
+    } else {
+      value[i] =
+          (local_uniform_real_distribution<double>()(local_random_engine()) *
+               2 -
+           1) *
+          _initial_range;
+      BoundValue(value[i]);
+    }
+  }
+  // init rule gsum and g2sum
+  for (int i = GSumIndex(); i < Beta1PowIndex(); i++) {
+    sgd[i] = 0.0;
+  }
+  // init beta1_pow and beta2_pow
+  *(sgd + Beta1PowIndex()) = _beta1_decay_rate;
+  *(sgd + Beta2PowIndex()) = _beta2_decay_rate;
+}
 }  // namespace distributed
 }  // namespace paddle
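The shared variant above keeps one scalar gsum/g2sum pair for the whole embedx vector and averages the per-dimension moments back into those scalars each step, so its optimizer state is 4 floats regardless of embedding width. A minimal host-side sketch of the same update follows (hyper-parameter values are illustrative; this is not the Paddle class itself):

// shared_adam_sketch.cc -- mirrors SparseSharedAdamSGDRule::UpdateValueWork.
#include <cmath>
#include <cstdio>
#include <vector>

struct SharedAdamSketch {
  float lr = 0.001f, beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
  float gsum = 0.f, g2sum = 0.f;  // shared first/second moments
  float beta1_pow, beta2_pow;     // initialized to beta1/beta2, as above
  SharedAdamSketch() : beta1_pow(beta1), beta2_pow(beta2) {}

  void Update(std::vector<float>& w, const std::vector<float>& g) {
    float lr_t = lr * std::sqrt(1 - beta2_pow) / (1 - beta1_pow);
    double sum_gsum = 0.0, sum_g2sum = 0.0;
    for (size_t i = 0; i < w.size(); ++i) {
      double new_gsum = beta1 * gsum + (1 - beta1) * g[i];
      double new_g2sum = beta2 * g2sum + (1 - beta2) * g[i] * g[i];
      w[i] -= lr_t * (new_gsum / (std::sqrt(new_g2sum) + eps));
      sum_gsum += new_gsum;
      sum_g2sum += new_g2sum;
    }
    gsum = static_cast<float>(sum_gsum / w.size());    // fold per-dim moments
    g2sum = static_cast<float>(sum_g2sum / w.size());  // back into the scalars
    beta1_pow *= beta1;
    beta2_pow *= beta2;
  }
};

int main() {
  SharedAdamSketch opt;
  std::vector<float> w(8, 0.1f), g(8, 0.05f);
  opt.Update(w, g);
  std::printf("w[0]=%f gsum=%f g2sum=%f\n", w[0], opt.gsum, opt.g2sum);
  return 0;
}

Compared with per-dimension Adam, this trades some precision in the moment estimates for O(1) state, which is what allows tmp_embedx_sgd_dim = 4 in CtrDymfAccessor::Dim() above.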
@@ -144,5 +144,28 @@ class SparseAdamSGDRule : public SparseValueSGDRule {
   float _beta2_decay_rate;
   float _ada_epsilon;
 };
+
+class SparseSharedAdamSGDRule : public SparseValueSGDRule {
+ public:
+  virtual void LoadConfig(const SparseCommonSGDRuleParameter& param,
+                          size_t emb_dim);
+  virtual void UpdateValueWork(float* w,
+                               float* sgd,
+                               const float* push_value,
+                               float scale);
+  virtual void InitValueWork(float* value, float* sgd, bool zero_init);
+  virtual size_t Dim() { return 4; }
+  size_t GSumIndex() { return 0; }
+  size_t G2SumIndex() { return GSumIndex() + 1; }
+  size_t Beta1PowIndex() { return G2SumIndex() + 1; }
+  size_t Beta2PowIndex() { return Beta1PowIndex() + 1; }
+
+ protected:
+  float learning_rate_;
+  float _beta1_decay_rate;
+  float _beta2_decay_rate;
+  float _ada_epsilon;
+};
+
 }  // namespace distributed
 }  // namespace paddle
@@ -49,6 +49,7 @@ REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
+REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseSharedAdamSGDRule);

 int32_t TableManager::Initialize() {
   static bool initialized = false;
......
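The REGISTER_PSCORE_CLASS line is what makes the new rule reachable from config: CREATE_PSCORE_CLASS in CtrDymfAccessor::Initialize looks the class name up in a string-keyed registry. A minimal sketch of that pattern (names and macro spelling here are illustrative, not Paddle's actual definitions):

// registry_sketch.cc -- the string-keyed factory idea, not Paddle's macros.
#include <functional>
#include <map>
#include <memory>
#include <string>

struct SGDRuleBase { virtual ~SGDRuleBase() = default; };
struct AdamRule : SGDRuleBase {};

using Factory = std::function<std::unique_ptr<SGDRuleBase>()>;
std::map<std::string, Factory>& Registry() {
  static std::map<std::string, Factory> r;
  return r;
}

// Registers a factory under the class name at static-init time.
#define REGISTER_RULE(type)                                                 \
  static const bool type##_reg =                                            \
      (Registry()[#type] =                                                  \
           [] { return std::unique_ptr<SGDRuleBase>(new type); },           \
       true)

REGISTER_RULE(AdamRule);

std::unique_ptr<SGDRuleBase> CreateRule(const std::string& name) {
  auto it = Registry().find(name);
  return it == Registry().end() ? nullptr : it->second();
}
// Usage: auto rule = CreateRule("AdamRule");  // class chosen by config string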
@@ -13,6 +13,7 @@ cc_library(
     op_registry
     fs
     shell
+    ps_gpu_wrapper
     ${RPC_DEPS})

 target_link_libraries(fleet z)
@@ -18,6 +18,10 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
 #include "paddle/fluid/distributed/ps/table/table.h"
+#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
+#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE
+#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
+#endif

 namespace paddle {
 namespace distributed {
@@ -129,6 +133,13 @@ void FleetWrapper::InitWorker(const std::string& dist_desc,
       worker_ptr_ = std::shared_ptr<paddle::distributed::PSClient>(
           paddle::distributed::PSClientFactory::Create(ps_param));
       worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index);
+#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE
+      VLOG(3) << "FleetWrapper::InitWorker InitializeGPUServer";
+      auto* accessor = worker_ptr_->GetTableAccessor(0);
+      auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance();
+      ps_gpu_wrapper->InitializeGPUServer(ps_param);
+      ps_gpu_wrapper->SetTableAccessor(accessor);
+#endif
     }
   } else {
     VLOG(3) << "Client can be initialized only once";
@@ -525,11 +536,11 @@ void FleetWrapper::PushSparseFromTensorAsync(
   int batch_size = -1;
   bool batch_size_consist = true;
   for (auto* input : *inputs) {
-    int cur_batch_size =
+    size_t cur_batch_size =
         input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
     if (batch_size == -1) {
-      batch_size = cur_batch_size;
-    } else if (batch_size != cur_batch_size) {
+      batch_size = int(cur_batch_size);
+    } else if (batch_size != int(cur_batch_size)) {
       // CHECK(batch_size == cur_batch_size);  // NOLINT
       batch_size_consist = false;
       break;
@@ -537,12 +548,12 @@ void FleetWrapper::PushSparseFromTensorAsync(
   }
   CHECK(batch_size > 0);  // NOLINT

-  int show_size =
+  size_t show_size =
       shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0];
-  CHECK(show_size == batch_size || show_size == 1);
-  int clk_size =
+  CHECK(show_size == size_t(batch_size) || show_size == 1);
+  size_t clk_size =
       clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0];
-  CHECK(clk_size == batch_size || clk_size == 1);
+  CHECK(clk_size == size_t(batch_size) || clk_size == 1);

   CHECK(outputs->size() == inputs->size());
   std::vector<uint64_t> push_keys;
@@ -601,12 +612,10 @@ void FleetWrapper::PushSparseFromTensorAsync(
       // in
       // ctr_accessor.h
       push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-      push_values.back()[1] = (static_cast<int>(i) >= show_size
-                                   ? 1
-                                   : static_cast<float>(show_tensor[i]));
-      push_values.back()[2] = (static_cast<int>(i) >= clk_size
-                                   ? 0
-                                   : static_cast<float>(clk_tensor[i]));
+      push_values.back()[1] =
+          (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+      push_values.back()[2] =
+          (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
       float* data = push_values.back().data() + 3;
       memcpy(data, g + output_len, sizeof(float) * fea_dim);
     }
@@ -630,12 +639,10 @@ void FleetWrapper::PushSparseFromTensorAsync(
       // slot show clk grad... consistent with CtrCommonPushValue defined in
       // ctr_accessor.h
       push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-      push_values.back()[1] = (static_cast<int>(i) >= show_size
-                                   ? 1
-                                   : static_cast<float>(show_tensor[i]));
-      push_values.back()[2] = (static_cast<int>(i) >= clk_size
-                                   ? 0
-                                   : static_cast<float>(clk_tensor[i]));
+      push_values.back()[1] =
+          (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+      push_values.back()[2] =
+          (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
       float* data = push_values.back().data() + 3;
       memcpy(data, g + output_len, sizeof(float) * fea_dim);
     }
......
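The int-to-size_t edits in these hunks all target one pitfall: comparing a signed value against an unsigned container size promotes the signed side, which both triggers -Wsign-compare and invites subtle bugs; the rewrite keeps every comparison within one signedness. A tiny standalone sketch of the surviving code path, with illustrative values:

// sign_compare_sketch.cc -- mirrors the show/clk broadcast logic above.
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> shows(1, 0.5f);  // show_size == 1: broadcast case
  size_t show_size = shows.size();
  for (size_t i = 0; i < 4; ++i) {
    // i and show_size are both size_t, so no mixed-sign comparison occurs;
    // rows beyond the tensor fall back to the broadcast value 1.
    float show = (i >= show_size) ? 1.f : shows[i];
    std::printf("i=%zu show=%f\n", i, show);
  }
  return 0;
}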
@@ -197,14 +197,14 @@ message TableParameter {

 message TableAccessorParameter {
   optional string accessor_class = 1;
-  optional SGDParameter embed_sgd_param = 2;
-  optional SGDParameter embedx_sgd_param = 3;
   optional uint32 fea_dim = 4 [ default = 11 ];  // field size of one value
   optional uint32 embedx_dim = 5 [ default = 8 ];  // embedx feature size
   optional uint32 embedx_threshold = 6
       [ default = 10 ];  // embedx feature create threshold
   optional CtrAccessorParameter ctr_accessor_param = 7;
   repeated TableAccessorSaveParameter table_accessor_save_param = 8;
+  optional SGDParameter embed_sgd_param = 10;
+  optional SGDParameter embedx_sgd_param = 11;
 }

 message SGDParameter {
@@ -228,7 +228,7 @@ message
   repeated float weight_bounds = 4;
 }

-message SparseAdamSGDParameter {  // SparseAdamSGDRule
+message SparseAdamSGDParameter {  // SparseAdamSGDRule | SparseSharedAdamSGDRule
   optional double learning_rate = 1 [ default = 0.001 ];
   optional double initial_range = 2 [ default = 0.0001 ];
   optional double beta1_decay_rate = 3 [ default = 0.9 ];
......
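With embed_sgd_param/embedx_sgd_param renumbered to fields 10 and 11, a GPUPS table can select an SGD rule per embedding part from config. A hedged C++ sketch of building such a config follows; it assumes Paddle's generated the_one_ps.pb.h and an SGDParameter name field that CREATE_PSCORE_CLASS resolves, as the accessor code above implies:

// accessor_config_sketch.cc -- illustrative only; field names follow the
// .proto above, message types assume Paddle's generated headers.
#include "paddle/fluid/distributed/the_one_ps.pb.h"

paddle::distributed::TableAccessorParameter MakeAccessorConfig() {
  paddle::distributed::TableAccessorParameter accessor;
  accessor.set_accessor_class("CtrDymfAccessor");
  accessor.set_embedx_dim(8);
  // Choose the new shared-Adam rule for the embedx part.
  auto* embedx_sgd = accessor.mutable_embedx_sgd_param();
  embedx_sgd->set_name("SparseSharedAdamSGDRule");
  auto* adam = embedx_sgd->mutable_adam();
  adam->set_learning_rate(0.001);
  adam->set_beta1_decay_rate(0.9);
  adam->set_beta2_decay_rate(0.999);
  return accessor;
}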
@@ -25,10 +25,17 @@ endif()

 if(WITH_HETERPS)
   if(WITH_NCCL AND WITH_GPU)
-    nv_library(
-      ps_gpu_wrapper
-      SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
-      DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
+    if(WITH_PSCORE)
+      nv_library(
+        ps_gpu_wrapper
+        SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+        DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS})
+    else()
+      nv_library(
+        ps_gpu_wrapper
+        SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
+        DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
+    endif()
     add_subdirectory(heter_ps)
   elseif(WITH_XPU_KP)
     xpu_library(
......
@@ -81,7 +81,6 @@ class HeterContext {
   std::vector<std::vector<FeatureValue>> device_values_;
   std::vector<std::vector<FeatureKey>> device_keys_;
   std::vector<std::vector<std::vector<FeatureKey>>> device_dim_keys_;
-  std::vector<std::vector<std::vector<FeatureValue>>> device_dim_values_;
   std::vector<std::mutex*> mutex_;
   std::vector<std::vector<std::mutex*>> dim_mutex_;
   int multi_mf_dim_ = 0;
@@ -114,7 +113,6 @@ class HeterContext {
       value_dim_ptr_[i].resize(dim_num);
     }
     device_values_.resize(device_num);
-    device_dim_values_.resize(device_num);
     device_keys_.resize(device_num);
     device_dim_keys_.resize(device_num);
......
@@ -9,16 +9,16 @@ if(WITH_GPU)
   endif()
   nv_library(
     heter_comm_kernel
-    SRCS heter_comm_kernel.cu feature_value.h
+    SRCS heter_comm_kernel.cu feature_value.h feature_value.cu
     DEPS ${HETERPS_DEPS})
   nv_library(
     hashtable_kernel
-    SRCS hashtable_kernel.cu feature_value.h
+    SRCS hashtable_kernel.cu feature_value.h feature_value.cu
     DEPS ${HETERPS_DEPS})
   nv_library(
     heter_comm
-    SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h
-         mem_pool.h
+    SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc
+         heter_resource.h mem_pool.h
     DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
   nv_test(
     test_heter_comm
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"

namespace paddle {
namespace framework {

template <typename FVAccessor>
__global__ void PullCopy(float** dest,
                         const float* src,
                         const int64_t* len,
                         int slot_num,
                         int total_len,
                         uint64_t** keys,
                         uint64_t max_val_size,
                         int* gpu_dim,
                         FVAccessor feature_value_accessor) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[x - 1] : 0);
    float* feature_value_ptr =
        (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
    int mf_dim = gpu_dim[x] - 3;
    feature_value_accessor.Select(
        dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
  }
}
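Both copy kernels map a flat element index i to its slot by binary-searching len, an inclusive prefix sum of per-slot lengths; y is then the offset inside that slot. A host-side sketch of the same lookup (standalone, illustrative types):

// slot_lookup_sketch.cc -- mirrors the binary search in PullCopy above.
#include <cstdio>

void Locate(const long long* len, int slot_num, long long i,
            int* slot, long long* offset) {
  int low = 0, high = slot_num - 1;
  // first x with i < len[x]
  while (low < high) {
    int mid = (low + high) / 2;
    if (i < len[mid]) high = mid; else low = mid + 1;
  }
  *slot = low;
  *offset = i - (low ? len[low - 1] : 0);
}

int main() {
  long long len[3] = {4, 9, 12};  // slots of size 4, 5, 3
  int slot;
  long long off;
  Locate(len, 3, 6, &slot, &off);
  std::printf("i=6 -> slot=%d offset=%lld\n", slot, off);  // slot=1 offset=2
  return 0;
}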
template <typename FVAccessor>
__global__ void PushCopyWithPool(float* dest,
                                 float** src,
                                 int64_t* len,
                                 int slot_num,
                                 uint64_t total_len,
                                 int bs,
                                 int* slot_vector,
                                 int* mf_dim_vector,
                                 size_t grad_value_size,
                                 FVAccessor feature_value_accessor) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[low - 1] : 0);
    float* cur = (float*)((char*)dest + i * grad_value_size);

    cur[feature_value_accessor.common_push_value.SlotIndex()] =
        (float)slot_vector[x];
    int mf_dim = mf_dim_vector[x];
    cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim;

    cur[feature_value_accessor.common_push_value.ShowIndex()] =
        *(src[x] + y * (mf_dim + 3));
    cur[feature_value_accessor.common_push_value.ClickIndex()] =
        *(src[x] + y * (mf_dim + 3) + 1);
    cur[feature_value_accessor.common_push_value.EmbedGIndex()] =
        *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
    for (int j = 0; j < mf_dim; j++) {
      cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] =
          *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
    }
  }
}
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
    const paddle::platform::Place& place,
    uint64_t** gpu_keys,
    const std::vector<float*>& values,
    const float* total_values_gpu,
    const int64_t* gpu_len,
    const int slot_num,
    const int hidden_size,
    const int64_t total_length,
    int* gpu_dim,
    int feature_value_size) {
  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
                    paddle::platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
  float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
  cudaMemcpy(gpu_values,
             values.data(),
             values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
  PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      gpu_values,
      total_values_gpu,
      gpu_len,
      slot_num,
      total_length,
      gpu_keys,
      feature_value_size,
      gpu_dim,
      gpu_accessor_);
  cudaStreamSynchronize(stream);
}

template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
    const paddle::platform::Place& place,
    const std::vector<const float*>& grad_values,
    float* total_grad_values_gpu,
    const std::vector<int64_t>& slot_lengths,
    const uint64_t total_length,
    const int batch_size,
    size_t grad_value_size,
    std::vector<int>& slot_vector,
    std::vector<int>& slot_mf_dim_vector) {
  auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
                    paddle::platform::DeviceContextPool::Instance().Get(place))
                    ->stream();
  auto slot_lengths_lod = slot_lengths;
  for (int i = 1; i < slot_lengths_lod.size(); i++) {
    slot_lengths_lod[i] += slot_lengths_lod[i - 1];
  }
  auto buf_grad_value =
      memory::Alloc(place, grad_values.size() * sizeof(float*));
  auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
  auto buf_slot_vector =
      memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
  auto buf_mf_dim_vector =
      memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
  float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
  int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
  int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
  int* d_mf_dim_vector = reinterpret_cast<int*>(buf_mf_dim_vector->ptr());
  cudaMemcpy(gpu_values,
             grad_values.data(),
             grad_values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_len,
             slot_lengths_lod.data(),
             slot_lengths.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_slot_vector,
             slot_vector.data(),
             slot_lengths_lod.size() * sizeof(int),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_mf_dim_vector,
             slot_mf_dim_vector.data(),
             slot_lengths_lod.size() * sizeof(int),
             cudaMemcpyHostToDevice);
  PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
      total_grad_values_gpu,
      gpu_values,
      gpu_len,
      slot_lengths.size(),
      total_length,
      batch_size,
      d_slot_vector,
      d_mf_dim_vector,
      grad_value_size,
      gpu_accessor_);
  cudaStreamSynchronize(stream);
}

#ifdef PADDLE_WITH_PSCORE
template class AccessorWrapper<CommonFeatureValueAccessor>;
#endif

}  // namespace framework
}  // namespace paddle
#endif
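Both launches above use the same convention: a ceil-divided grid of 1024-thread blocks, with CUDA_KERNEL_LOOP (a grid-stride loop in Paddle) covering any remainder. A self-contained sketch of that pattern without Paddle types:

// launch_sketch.cu -- illustrative only; shows the grid sizing and the
// grid-stride loop that CUDA_KERNEL_LOOP expands to.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void Scale(float* data, int n, float factor) {
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {  // grid-stride loop
    data[i] *= factor;
  }
}

int main() {
  const int n = 4096;
  float* d = nullptr;
  cudaMalloc(&d, n * sizeof(float));
  cudaMemset(d, 0, n * sizeof(float));
  Scale<<<(n + 1024 - 1) / 1024, 1024>>>(d, n, 2.0f);  // ceil(n / 1024) blocks
  cudaDeviceSynchronize();
  cudaFree(d);
  std::printf("launched %d blocks\n", (n + 1023) / 1024);
  return 0;
}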
@@ -25,10 +25,12 @@
 #ifdef PADDLE_WITH_HETERPS
 namespace paddle {
 namespace framework {
-class GpuPsGraphTable : public HeterComm<uint64_t, int64_t, int> {
+class GpuPsGraphTable
+    : public HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor> {
  public:
   GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource, int topo_aware)
-      : HeterComm<uint64_t, int64_t, int>(1, resource) {
+      : HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor>(
+            1, resource) {
     load_factor_ = 0.25;
     rw_lock.reset(new pthread_rwlock_t());
     gpu_num = resource_->total_device();
@@ -137,8 +137,12 @@ class HashTable {
               size_t len,
               StreamType stream);

-  template <typename StreamType>
-  void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream);
+  template <typename StreamType, typename FVAccessor>
+  void get(const KeyType* d_keys,
+           char* d_vals,
+           size_t len,
+           StreamType stream,
+           FVAccessor& fv_accessor);

   void show();
@@ -150,9 +154,9 @@ class HashTable {
 #if defined(PADDLE_WITH_CUDA)
-  template <typename GradType, typename Sgd, typename StreamType>
+  template <typename Sgd, typename StreamType>
   void update(const KeyType* d_keys,
-              const GradType* d_grads,
+              const float* d_grads,
               size_t len,
               Sgd sgd,
               StreamType stream);
......
@@ -83,36 +83,25 @@ __global__ void search_kernel(Table* table,
   }
 }

-template <typename Table>
+template <typename Table, typename FVAccessor>
 __global__ void dy_mf_search_kernel(Table* table,
                                     const typename Table::key_type* const keys,
                                     char* vals,
                                     size_t len,
-                                    size_t pull_feature_value_size) {
+                                    size_t pull_feature_value_size,
+                                    FVAccessor feature_value_accessor) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < len) {
     auto it = table->find(keys[i]);
     if (it != table->end()) {
       uint64_t offset = i * pull_feature_value_size;
-      FeatureValue* cur = (FeatureValue*)(vals + offset);
-      FeatureValue& input = *(FeatureValue*)(it->second);
-      cur->slot = input.slot;
-      cur->show = input.show;
-      cur->clk = input.clk;
-      cur->mf_dim = input.mf_dim;
-      cur->lr = input.lr;
-      cur->mf_size = input.mf_size;
-      cur->cpu_ptr = input.cpu_ptr;
-      cur->delta_score = input.delta_score;
-      cur->lr_g2sum = input.lr_g2sum;
-      for (int j = 0; j < cur->mf_dim + 1; ++j) {
-        cur->mf[j] = input.mf[j];
-      }
-    } else {
-      if (keys[i] != 0) {
-        printf("warning::pull miss key: %llu", keys[i]);
-      }
+      float* cur = (float*)(vals + offset);
+      float* input = it->second;
+      int mf_dim =
+          int(input[feature_value_accessor.common_feature_value.MfDimIndex()]);
+      feature_value_accessor.FeatureValueFill(cur, input, mf_dim);
     }
   }
 }
@@ -145,8 +134,8 @@ __global__ void dy_mf_update_kernel(Table* table,
   if (i < len) {
     auto it = table->find(keys[i]);
     if (it != table->end()) {
-      FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size);
-      sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur);
+      float* cur = (float*)(grads + i * grad_value_size);
+      sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur);
     } else {
       if (keys[i] != 0) {
         printf("warning::push miss key: %llu", keys[i]);
@@ -212,17 +201,18 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
 }

 template <typename KeyType, typename ValType>
-template <typename StreamType>
+template <typename StreamType, typename FVAccessor>
 void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
                                       char* d_vals,
                                       size_t len,
-                                      StreamType stream) {
+                                      StreamType stream,
+                                      FVAccessor& fv_accessor) {
   if (len == 0) {
     return;
   }
   const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
   dy_mf_search_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(
-      container_, d_keys, d_vals, len, pull_feature_value_size_);
+      container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor);
 }

 template <typename KeyType, typename ValType>
@@ -298,27 +288,6 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
         cpu_val[x + 7] = gpu_val.mf[x];
       }
     }
-#endif
-#ifdef PADDLE_WITH_PSCORE
-    auto* downpour_value =
-        (paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr);
-    int downpour_value_size = downpour_value->size();
-    if (gpu_val.mf_size > 0 && downpour_value_size == 7) {
-      downpour_value->resize(gpu_val.mf_size + downpour_value_size);
-    }
-    float* cpu_val = downpour_value->data();
-    // cpu_val[0] = 0;
-    cpu_val[2] = gpu_val.delta_score;
-    cpu_val[3] = gpu_val.show;
-    cpu_val[4] = gpu_val.clk;
-    cpu_val[5] = gpu_val.lr;
-    cpu_val[6] = gpu_val.lr_g2sum;
-    cpu_val[0] = gpu_val.slot;
-    if (gpu_val.mf_size > 0) {
-      for (int x = 0; x < gpu_val.mf_size; x++) {
-        cpu_val[x + 7] = gpu_val.mf[x];
-      }
-    }
 #endif
   }
 };
@@ -336,9 +305,9 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
 }

 template <typename KeyType, typename ValType>
-template <typename GradType, typename Sgd, typename StreamType>
+template <typename Sgd, typename StreamType>
 void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
-                                         const GradType* d_grads,
+                                         const float* d_grads,
                                          size_t len,
                                          Sgd sgd,
                                          StreamType stream) {
@@ -371,8 +340,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
                                          push_grad_value_size_);
 }

-template class HashTable<unsigned long, paddle::framework::FeatureValue>;
-template class HashTable<unsigned long, paddle::framework::FeatureValue*>;
+template class HashTable<unsigned long, float>;
+template class HashTable<unsigned long, float*>;
 template class HashTable<long, int>;
 template class HashTable<unsigned long, int>;
 template class HashTable<unsigned long, unsigned long>;
@@ -382,15 +351,19 @@ template class HashTable<long, long>;
 template class HashTable<long, unsigned long>;
 template class HashTable<long, unsigned int>;

-template void HashTable<unsigned long, paddle::framework::FeatureValue>::get<
-    cudaStream_t>(const unsigned long* d_keys,
-                  paddle::framework::FeatureValue* d_vals,
-                  size_t len,
-                  cudaStream_t stream);
+template void HashTable<unsigned long, float>::get<cudaStream_t>(
+    const unsigned long* d_keys,
+    float* d_vals,
+    size_t len,
+    cudaStream_t stream);

 template void
-HashTable<unsigned long, paddle::framework::FeatureValue*>::get<cudaStream_t>(
-    const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream);
+HashTable<unsigned long, float*>::get<cudaStream_t, CommonFeatureValueAccessor>(
+    const unsigned long* d_keys,
+    char* d_vals,
+    size_t len,
+    cudaStream_t stream,
+    CommonFeatureValueAccessor& fv_accessor);

 template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
                                                       int* d_vals,
@@ -399,6 +372,12 @@ template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
 template void HashTable<unsigned long, int>::get<cudaStream_t>(
     const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream);
+template void HashTable<unsigned long, unsigned long>::get<cudaStream_t>(
+    const unsigned long* d_keys,
+    unsigned long* d_vals,
+    size_t len,
+    cudaStream_t stream);
 template void HashTable<long, unsigned long>::get<cudaStream_t>(
     const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream);
 template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys,
@@ -414,19 +393,19 @@ template void HashTable<unsigned long, long>::get<cudaStream_t>(
 //    const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
 //    stream);

-template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert<
-    cudaStream_t>(const unsigned long* d_keys,
-                  const paddle::framework::FeatureValue* d_vals,
-                  size_t len,
-                  cudaStream_t stream);
+template void HashTable<unsigned long, float>::insert<cudaStream_t>(
+    const unsigned long* d_keys,
+    const float* d_vals,
+    size_t len,
+    cudaStream_t stream);

-template void HashTable<unsigned long, paddle::framework::FeatureValue*>::
-    insert<cudaStream_t>(const unsigned long* d_keys,
-                         size_t len,
-                         char* pool,
-                         size_t feature_value_size,
-                         size_t start_index,
-                         cudaStream_t stream);
+template void HashTable<unsigned long, float*>::insert<cudaStream_t>(
+    const unsigned long* d_keys,
+    size_t len,
+    char* pool,
+    size_t feature_value_size,
+    size_t start_index,
+    cudaStream_t stream);

 template void HashTable<long, int>::insert<cudaStream_t>(const long* d_keys,
                                                          const int* d_vals,
@@ -460,30 +439,37 @@ template void HashTable<unsigned long, long>::insert<cudaStream_t>(
     size_t len,
     cudaStream_t stream);

-template void HashTable<unsigned long, paddle::framework::FeatureValue>::
-    dump_to_cpu<cudaStream_t>(int devid, cudaStream_t stream);
+template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>(
+    const unsigned long* d_keys,
+    const unsigned long* d_vals,
+    size_t len,
+    cudaStream_t stream);

-template void HashTable<unsigned long, paddle::framework::FeatureValue>::update<
-    paddle::framework::FeaturePushValue,
-    Optimizer<paddle::framework::FeatureValue,
-              paddle::framework::FeaturePushValue>,
-    cudaStream_t>(const unsigned long* d_keys,
-                  const paddle::framework::FeaturePushValue* d_grads,
-                  size_t len,
-                  Optimizer<paddle::framework::FeatureValue,
-                            paddle::framework::FeaturePushValue> sgd,
-                  cudaStream_t stream);
+template void HashTable<unsigned long, float*>::dump_to_cpu<cudaStream_t>(
+    int devid, cudaStream_t stream);

-template void HashTable<unsigned long, paddle::framework::FeatureValue*>::
-    update<Optimizer<paddle::framework::FeatureValue,
-                     paddle::framework::FeaturePushValue>,
-           cudaStream_t>(const unsigned long* d_keys,
-                         const char* d_grads,
-                         size_t len,
-                         Optimizer<paddle::framework::FeatureValue,
-                                   paddle::framework::FeaturePushValue> sgd,
-                         cudaStream_t stream);
+template void
+HashTable<unsigned long, float*>::update<SparseAdagradOptimizer, cudaStream_t>(
+    const unsigned long* d_keys,
+    const char* d_grads,
+    size_t len,
+    SparseAdagradOptimizer sgd,
+    cudaStream_t stream);
+template void
+HashTable<unsigned long, float*>::update<SparseAdamOptimizer, cudaStream_t>(
+    const unsigned long* d_keys,
+    const char* d_grads,
+    size_t len,
+    SparseAdamOptimizer sgd,
+    cudaStream_t stream);
+template void HashTable<unsigned long, float*>::update<
+    SparseAdamSharedOptimizer,
+    cudaStream_t>(const unsigned long* d_keys,
+                  const char* d_grads,
+                  size_t len,
+                  SparseAdamSharedOptimizer sgd,
+                  cudaStream_t stream);

 // template void HashTable<unsigned long,
 // paddle::framework::FeatureValue>::update<
 //    Optimizer<paddle::framework::FeatureValue,
......
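The long tail of template void HashTable<...>:: lines above is explicit instantiation: the member-function definitions live only in this .cu file, and each line forces code generation for one concrete type combination so other translation units can link against it. A compact sketch of the pattern (illustrative names, not Paddle's):

// instantiation_sketch.cc -- definitions stay in one TU; only the listed
// combinations are emitted and therefore linkable.
template <typename K, typename V>
struct TableSketch {
  void get(const K* keys, V* vals, int len);  // declared in a header
};

template <typename K, typename V>
void TableSketch<K, V>::get(const K* keys, V* vals, int len) {
  for (int i = 0; i < len; ++i) vals[i] = V();  // stub body
}

// Each line emits code for one combination, mirroring
// "template void HashTable<unsigned long, float>::get<cudaStream_t>(...)".
template struct TableSketch<unsigned long, float>;
template struct TableSketch<long, int>;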
@@ -46,7 +46,10 @@ namespace framework {
 #define TYPEALIGN(ALIGNVAL, LEN) \
   (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))

-template <typename KeyType, typename ValType, typename GradType>
+template <typename KeyType,
+          typename ValType,
+          typename GradType,
+          typename FVAccessor>
 class HeterComm {
  public:
   HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource);
@@ -65,12 +68,9 @@ class HeterComm {
                   GradType* d_grads,
                   size_t len,
                   int& uniq_len);  // NOLINT
-  void dynamic_merge_grad(int gpu_num,
-                          KeyType* d_keys,
-                          GradType* d_grads,
-                          size_t len,
-                          int& uniq_len);
-  void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len);
+  void dynamic_merge_grad(
+      int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len);
+  void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
   void build_ps(int num,
                 KeyType* h_keys,
                 ValType* h_vals,
@@ -92,7 +92,7 @@ class HeterComm {
   template <typename Sgd>
   void push_sparse(int num,
                    KeyType* d_keys,
-                   GradType* d_grads,
+                   float* d_grads,
                    size_t len,
                    Sgd& sgd);  // NOLINT
 #elif defined(PADDLE_WITH_XPU_KP)
@@ -149,6 +149,13 @@ class HeterComm {
     multi_mf_dim_ = multi_mf_dim;
     max_mf_dim_ = max_mf_dim;
   }
+
+  void set_accessor(FVAccessor& accessor) {
+    feature_value_accessor_ = accessor;
+    // for (auto& ptr_table: ptr_tables_) {
+    //   ptr_table->set_accessor(feature_value_accessor_);
+    // }
+  }
 #endif

   bool need_transfer(int send_id, int receive_id) {
@@ -282,9 +289,11 @@ class HeterComm {
                     char* src_val,
                     size_t val_size);

+  FVAccessor feature_value_accessor_;
+
  protected:
   using Table = HashTable<KeyType, ValType>;
-  using PtrTable = HashTable<KeyType, ValType*>;
+  using PtrTable = HashTable<KeyType, float*>;
   std::vector<Table*> tables_;
   std::vector<PtrTable*> ptr_tables_;
   std::shared_ptr<HeterPsResource> resource_;
......
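HeterComm now carries the accessor as a fourth template parameter instead of hard-coding FeatureValue/FeaturePushValue, so device code can call the layout's methods with no virtual dispatch. A minimal sketch of this policy-parameter pattern (types are illustrative, not Paddle's):

// policy_param_sketch.cc -- the value-layout policy is a compile-time
// template parameter, as with HeterComm's FVAccessor above.
#include <cstdio>

struct DenseAccessor {
  int ShowIndex() const { return 0; }
  int ClickIndex() const { return 1; }
};

template <typename KeyType, typename FVAccessor>
class CommSketch {
 public:
  void set_accessor(const FVAccessor& a) { accessor_ = a; }
  void Dump(const float* val) const {
    // Calls resolve statically; usable from __device__ code in real kernels.
    std::printf("show=%f click=%f\n",
                val[accessor_.ShowIndex()], val[accessor_.ClickIndex()]);
  }

 private:
  FVAccessor accessor_;
};

int main() {
  CommSketch<unsigned long, DenseAccessor> comm;
  comm.set_accessor(DenseAccessor{});
  float v[2] = {3.f, 1.f};
  comm.Dump(v);
  return 0;
}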
@@ -128,22 +128,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals,
   }
 }

-template <typename KeyType, typename GradType, typename T>
-__global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys,
-                                              KeyType* d_keys,
-                                              GradType* d_shard_grads,
-                                              GradType* d_grads,
-                                              T* idx,
-                                              size_t len,
-                                              size_t grad_value_size) {
+template <typename KeyType, typename T, typename FVAccessor>
+__global__ void dy_mf_fill_shard_grads_kernel(
+    KeyType* d_shard_keys,
+    KeyType* d_keys,
+    float* d_shard_grads,
+    float* d_grads,
+    T* idx,
+    size_t len,
+    size_t grad_value_size,
+    FVAccessor feature_value_accessor) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < len) {
     d_shard_keys[i] = d_keys[idx[i]];
-    *(GradType*)((char*)d_shard_grads + i * grad_value_size) =
-        *(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size);
+    float* cur = (float*)((char*)d_shard_grads + i * grad_value_size);
+    float* shard_val =
+        (float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size);
+    feature_value_accessor.PushValueFill(cur, shard_val);
   }
 }

+template <typename FVAccessor>
 __global__ void merge_gradients_kernel(const uint32_t* offset,
                                        const uint32_t* fea_num,
                                        const uint32_t* index,
@@ -151,36 +157,40 @@ __global__ void merge_gradients_kernel(const uint32_t* offset,
                                        char* output,
                                        int n,
                                        size_t grad_value_size,
-                                       DynamicGradMerger& merger_) {
+                                       DynamicGradMerger& merger,
+                                       FVAccessor& feature_value_accessor) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < n) {
     uint32_t start = offset[i];
     uint32_t num = fea_num[i];
     int ori_index = index[start];
-    FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size);
-    FeaturePushValue& in =
-        *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size);
-    merger_.update_one(out, in);
+    float* out = (float*)(output + i * grad_value_size);
+    float* in = (float*)(input + size_t(ori_index) * grad_value_size);
+    merger.update_one(out, in, feature_value_accessor);
     for (int j = 1; j < num; ++j) {
       ori_index = index[start + j];
-      FeaturePushValue& rhs =
-          *(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size);
-      merger_.merge_one(out, rhs);
+      in = (float*)(input + size_t(ori_index) * grad_value_size);
+      merger.merge_one(out, in, feature_value_accessor);
     }
   }
 }

-template <typename ValType, typename T>
-__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals,
-                                        ValType* d_vals,
-                                        T* idx,
-                                        size_t len,
-                                        size_t val_size) {
+template <typename T, typename FVAccessor>
+__global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals,
+                                        float* d_vals,
+                                        T* idx,
+                                        size_t len,
+                                        size_t val_size,
+                                        FVAccessor feature_value_accessor) {
   const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
   if (i < len) {
     uint64_t new_offset = uint64_t(idx[i]) * val_size;
-    *(ValType*)((char*)d_vals + new_offset) =
-        *(ValType*)((char*)d_shard_vals + i * val_size);
+    float* cur = (float*)((char*)d_vals + new_offset);
+    float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size);
+    int mf_dim = int(
+        shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]);
+    feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim);
   }
 }
@@ -312,15 +322,20 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage,
                                debug_synchronous));
 }

-template <typename KeyType, typename GradType, typename T, typename StreamType>
-void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys,
-                                             KeyType* d_keys,
-                                             GradType* d_shard_grads,
-                                             GradType* d_grads,
-                                             T* idx,
-                                             long long len,
-                                             size_t grad_value_size,
-                                             const StreamType& stream) {
+template <typename KeyType,
+          typename T,
+          typename StreamType,
+          typename FVAccessor>
+void HeterCommKernel::dy_mf_fill_shard_grads(
+    KeyType* d_shard_keys,
+    KeyType* d_keys,
+    float* d_shard_grads,
+    float* d_grads,
+    T* idx,
+    long long len,
+    size_t grad_value_size,
+    const StreamType& stream,
+    FVAccessor& feature_value_accessor) {
   int grid_size = (len - 1) / block_size_ + 1;
   size_t c_len = (size_t)len;
   dy_mf_fill_shard_grads_kernel<<<grid_size, block_size_, 0, stream>>>(
@@ -330,10 +345,11 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys,
       d_grads,
       idx,
       c_len,
-      grad_value_size);
+      grad_value_size,
+      feature_value_accessor);
 }

-template <typename StreamType>
+template <typename StreamType, typename FVAccessor>
 void HeterCommKernel::merge_gradient(const uint32_t* offset,
                                      const uint32_t* fea_num,
                                      const uint32_t* index,
@@ -342,23 +358,33 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset,
                                      int n,
                                      size_t grad_value_size,
                                      DynamicGradMerger& merger_,
-                                     const StreamType& stream) {
+                                     const StreamType& stream,
+                                     FVAccessor& feature_value_accessor) {
   int grid_size = (n - 1) / block_size_ + 1;
   merge_gradients_kernel<<<grid_size, block_size_, 0, stream>>>(
-      offset, fea_num, index, input, output, n, grad_value_size, merger_);
+      offset,
+      fea_num,
+      index,
+      input,
+      output,
+      n,
+      grad_value_size,
+      merger_,
+      feature_value_accessor);
 }

-template <typename ValType, typename T, typename StreamType>
-void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals,
-                                       ValType* d_vals,
-                                       T* idx,
-                                       long long len,
-                                       size_t val_size,
-                                       const StreamType& stream) {
+template <typename T, typename StreamType, typename FVAccessor>
+void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals,
+                                       float* d_vals,
+                                       T* idx,
+                                       long long len,
+                                       size_t val_size,
+                                       const StreamType& stream,
+                                       FVAccessor& feature_value_accessor) {
   int grid_size = (len - 1) / block_size_ + 1;
   size_t c_len = (size_t)len;
   dy_mf_fill_dvals_kernel<<<grid_size, block_size_, 0, stream>>>(
-      d_shard_vals, d_vals, idx, c_len, val_size);
+      d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor);
 }

 template void HeterCommKernel::fill_idx<int, cudaStream_t>(
@@ -402,17 +428,15 @@ template void HeterCommKernel::fill_shard_key<unsigned long, int, cudaStream_t>(
     long long len,
     const cudaStream_t& stream);

-template void HeterCommKernel::fill_shard_grads<
-    unsigned long,
-    paddle::framework::FeaturePushValue,
-    int,
-    cudaStream_t>(unsigned long* d_shard_keys,
-                  unsigned long* d_keys,
-                  paddle::framework::FeaturePushValue* d_shard_grads,
-                  paddle::framework::FeaturePushValue* d_grads,
-                  int* idx,
-                  long long len,
-                  const cudaStream_t& stream);
+template void
+HeterCommKernel::fill_shard_grads<unsigned long, float, int, cudaStream_t>(
+    unsigned long* d_shard_keys,
+    unsigned long* d_keys,
+    float* d_shard_grads,
+    float* d_grads,
+    int* idx,
+    long long len,
+    const cudaStream_t& stream);

 template void
 HeterCommKernel::fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>(
@@ -467,20 +491,23 @@ template void HeterCommKernel::reduce_by_key<
     cudaStream_t stream,
     bool debug_synchronous);

-template void HeterCommKernel::dy_mf_fill_shard_grads<
-    unsigned long,
-    paddle::framework::FeaturePushValue,
-    int,
-    cudaStream_t>(unsigned long* d_shard_keys,
-                  unsigned long* d_keys,
-                  paddle::framework::FeaturePushValue* d_shard_grads,
-                  paddle::framework::FeaturePushValue* d_grads,
-                  int* idx,
-                  long long len,
-                  size_t grad_value_size,
-                  const cudaStream_t& stream);
+template void
+HeterCommKernel::dy_mf_fill_shard_grads<unsigned long,
+                                        int,
+                                        cudaStream_t,
+                                        CommonFeatureValueAccessor>(
+    unsigned long* d_shard_keys,
+    unsigned long* d_keys,
+    float* d_shard_grads,
+    float* d_grads,
+    int* idx,
+    long long len,
+    size_t grad_value_size,
+    const cudaStream_t& stream,
+    CommonFeatureValueAccessor& feature_value_accessor);

-template void HeterCommKernel::merge_gradient<cudaStream_t>(
+template void
+HeterCommKernel::merge_gradient<cudaStream_t, CommonFeatureValueAccessor>(
     const uint32_t* offset,
     const uint32_t* fea_num,
     const uint32_t* index,
@@ -489,16 +516,18 @@ template void HeterCommKernel::merge_gradient<cudaStream_t>(
     int n,
     size_t grad_value_size,
     DynamicGradMerger& merger_,
-    const cudaStream_t& stream);
+    const cudaStream_t& stream,
+    CommonFeatureValueAccessor& feature_value_accessor);

-template void HeterCommKernel::
-    dy_mf_fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>(
-        paddle::framework::FeatureValue* d_shard_vals,
-        paddle::framework::FeatureValue* d_vals,
-        int* idx,
-        long long len,
-        size_t val_size,
-        const cudaStream_t& stream);
+template void HeterCommKernel::
+    dy_mf_fill_dvals<int, cudaStream_t, CommonFeatureValueAccessor>(
+        float* d_shard_vals,
+        float* d_vals,
+        int* idx,
+        long long len,
+        size_t val_size,
+        const cudaStream_t& stream,
+        CommonFeatureValueAccessor& feature_value_accessor);
 #endif

 }  // namespace framework
......
@@ -41,25 +41,16 @@ struct DynamicGradMerger {
     return out;
   }

-  template <typename T>
-  __device__ __forceinline__ void update_one(T& output, const T& input) {
-    output.slot = input.slot;
-    output.show = input.show;
-    output.clk = input.clk;
-    output.mf_dim = input.mf_dim;
-    output.lr_g = input.lr_g;
-    for (int i = 0; i < output.mf_dim; ++i) {
-      output.mf_g[i] = input.mf_g[i];
-    }
+  template <typename FVAccessor>
+  __device__ __forceinline__ void update_one(
+      float* output, const float* input, FVAccessor& feature_value_accessor) {
+    feature_value_accessor.PushValueFill(output, input);
   }

-  template <typename T>
-  __device__ __forceinline__ void merge_one(T& output, const T& input) {
-    output.show += input.show;
-    output.clk += input.clk;
-    output.lr_g += input.lr_g;
-    for (int i = 0; i < input.mf_dim; ++i) {
-      output.mf_g[i] += input.mf_g[i];
-    }
+  template <typename FVAccessor>
+  __device__ __forceinline__ void merge_one(
+      float* output, const float* input, FVAccessor& feature_value_accessor) {
+    feature_value_accessor.MergePushValue(output, input);
   }
 };
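In merge_gradients_kernel above, update_one seeds the output record from the first gradient of a key and merge_one accumulates the remaining ones, i.e. a segmented reduction over same-key pushes. A host-side sketch of those semantics on a struct-of-floats gradient (illustrative layout, not the accessor's):

// grad_merge_sketch.cc -- seed-then-accumulate reduction per key.
#include <cstdio>
#include <vector>

struct Grad { float show, clk, lr_g; };

void UpdateOne(Grad& out, const Grad& in) { out = in; }  // seed
void MergeOne(Grad& out, const Grad& in) {               // accumulate
  out.show += in.show;
  out.clk += in.clk;
  out.lr_g += in.lr_g;
}

int main() {
  // Three pushes for the same key reduce to one gradient record.
  std::vector<Grad> pushes = {{1, 0, 0.5f}, {1, 1, 0.25f}, {1, 0, 0.25f}};
  Grad out;
  UpdateOne(out, pushes[0]);
  for (size_t j = 1; j < pushes.size(); ++j) MergeOne(out, pushes[j]);
  std::printf("show=%.0f clk=%.0f lr_g=%.2f\n", out.show, out.clk, out.lr_g);
  return 0;
}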
@@ -146,19 +137,20 @@ class HeterCommKernel {
                      bool debug_synchronous = false);

   template <typename KeyType,
-            typename GradType,
             typename T,
-            typename StreamType>
+            typename StreamType,
+            typename FVAccessor>
   void dy_mf_fill_shard_grads(KeyType* d_shard_keys,
                               KeyType* d_keys,
-                              GradType* d_shard_grads,
-                              GradType* d_grads,
+                              float* d_shard_grads,
+                              float* d_grads,
                               T* idx,
                               long long len,
                               size_t grad_value_size,
-                              const StreamType& stream);
+                              const StreamType& stream,
+                              FVAccessor& feature_value_accessor);

-  template <typename StreamType>
+  template <typename StreamType, typename FVAccessor>
   void merge_gradient(const uint32_t* offset,
                       const uint32_t* fea_num,
                       const uint32_t* index,
@@ -167,15 +159,17 @@ class HeterCommKernel {
                       int n,
                       size_t grad_value_size,
                       DynamicGradMerger& merger_,
-                      const StreamType& stream);
+                      const StreamType& stream,
+                      FVAccessor& feature_value_accessor);

-  template <typename ValType, typename T, typename StreamType>
-  void dy_mf_fill_dvals(ValType* d_shard_vals,
-                        ValType* d_vals,
+  template <typename T, typename StreamType, typename FVAccessor>
+  void dy_mf_fill_dvals(float* d_shard_vals,
+                        float* d_vals,
                         T* idx,
                         long long len,
                         size_t val_size,
-                        const StreamType& stream);
+                        const StreamType& stream,
+                        FVAccessor& feature_value_accessor);

  private:
   int block_size_{256};
......
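The dy_mf_fill_* kernels declared above now move raw float records of stride val_size/grad_value_size and take the accessor so they can patch per-record fields (e.g. the dynamic mf part) instead of copying a fixed struct. A host-side sketch of the scatter dy_mf_fill_dvals performs, assuming idx[i] gives the destination slot of shard record i (fill_dvals here is an illustrative stand-in, not the CUDA kernel):

    #include <cstring>

    // Host-side stand-in for dy_mf_fill_dvals: scatter fixed-stride records
    // from shard order back to their original positions via idx. val_size is
    // the per-record byte stride (larger than the static fields alone once
    // the dynamic mf part is embedded).
    void fill_dvals(const char* d_shard_vals, char* d_vals, const int* idx,
                    long long len, size_t val_size) {
      for (long long i = 0; i < len; ++i) {
        // shard record i belongs at output position idx[i]
        std::memcpy(d_vals + idx[i] * val_size,
                    d_shard_vals + i * val_size, val_size);
      }
    }

    int main() {
      const size_t val_size = 2 * sizeof(float);
      float shard[4] = {1, 2, 3, 4};  // two 2-float records in shard order
      float out[4] = {};
      int idx[2] = {1, 0};            // record 0 -> slot 1, record 1 -> slot 0
      fill_dvals(reinterpret_cast<const char*>(shard),
                 reinterpret_cast<char*>(out), idx, 2, val_size);
      return 0;  // out == {3, 4, 1, 2}
    }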
...@@ -22,34 +22,43 @@ namespace paddle { ...@@ -22,34 +22,43 @@ namespace paddle {
namespace framework { namespace framework {
HeterPsBase* HeterPsBase::get_instance( HeterPsBase* HeterPsBase::get_instance(
size_t capacity, std::shared_ptr<HeterPsResource> resource) { size_t capacity,
return new HeterPs(capacity, resource); std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" &&
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
} else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get "
<< accessor_type_;
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
}
} }
HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) { HeterPs::HeterPs(size_t capacity,
comm_ = std::shared_ptr<HeterPsResource> resource,
std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>( std::unordered_map<std::string, float> fleet_config,
capacity, resource); std::string accessor_type,
int optimizer_type) {
comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
capacity, resource);
optimizer_type_ = optimizer_type;
} }
HeterPs::~HeterPs() {} HeterPs::~HeterPs() {}
void HeterPs::pull_sparse(int num, void HeterPs::pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeatureValue* d_vals, float* d_vals,
size_t len) { size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len); comm_->pull_sparse(num, d_keys, d_vals, len);
} }
void HeterPs::build_ps(int num,
FeatureKey* h_keys,
FeatureValue* h_vals,
size_t len,
size_t chunk_size,
int stream_num) {
comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num);
}
int HeterPs::get_index_by_devid(int devid) { int HeterPs::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid); return comm_->get_index_by_devid(devid);
} }
...@@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } ...@@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
void HeterPs::push_sparse(int num, void HeterPs::push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeaturePushValue* d_grads, float* d_grads,
size_t len) { size_t len) {
comm_->push_sparse(num, d_keys, d_grads, len); comm_->push_sparse(num, d_keys, d_grads, len);
// comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
......
...@@ -22,80 +22,139 @@ namespace paddle { ...@@ -22,80 +22,139 @@ namespace paddle {
namespace framework { namespace framework {
HeterPsBase* HeterPsBase::get_instance( HeterPsBase* HeterPsBase::get_instance(
size_t capacity, std::shared_ptr<HeterPsResource> resource) { size_t capacity,
return new HeterPs(capacity, resource); std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" &&
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
} else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get "
<< accessor_type;
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
}
} }
HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) { template <typename FVAccessor>
comm_ = HeterPs<FVAccessor>::HeterPs(
std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>( size_t capacity,
capacity, resource); std::shared_ptr<HeterPsResource> resource,
opt_ = Optimizer<FeatureValue, FeaturePushValue>(); std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
capacity, resource);
feature_value_accessor_.Configure(fleet_config);
set_accessor(feature_value_accessor_);
accessor_type_ = accessor_type;
optimizer_type_ = optimizer_type;
} }
HeterPs::~HeterPs() {} template <typename FVAccessor>
HeterPs<FVAccessor>::~HeterPs() {}
void HeterPs::pull_sparse(int num, template <typename FVAccessor>
FeatureKey* d_keys, void HeterPs<FVAccessor>::pull_sparse(int num,
FeatureValue* d_vals, FeatureKey* d_keys,
size_t len) { float* d_vals,
size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len); comm_->pull_sparse(num, d_keys, d_vals, len);
} }
void HeterPs::build_ps(int num, template <typename FVAccessor>
FeatureKey* h_keys, void HeterPs<FVAccessor>::build_ps(int num,
FeatureValue* h_vals, FeatureKey* h_keys,
size_t len, char* pool,
size_t chunk_size, size_t len,
int stream_num) { size_t feature_value_size,
comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); size_t chunk_size,
} int stream_num) {
void HeterPs::build_ps(int num,
FeatureKey* h_keys,
char* pool,
size_t len,
size_t feature_value_size,
size_t chunk_size,
int stream_num) {
comm_->build_ps( comm_->build_ps(
num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); num, h_keys, pool, len, feature_value_size, chunk_size, stream_num);
} }
int HeterPs::get_index_by_devid(int devid) { template <typename FVAccessor>
int HeterPs<FVAccessor>::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid); return comm_->get_index_by_devid(devid);
} }
void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { template <typename FVAccessor>
void HeterPs<FVAccessor>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_sparse_sgd(optimizer_config); comm_->set_sparse_sgd(optimizer_config);
} }
void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { template <typename FVAccessor>
void HeterPs<FVAccessor>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_embedx_sgd(optimizer_config); comm_->set_embedx_sgd(optimizer_config);
} }
void HeterPs::end_pass() { comm_->end_pass(); } template <typename FVAccessor>
void HeterPs<FVAccessor>::end_pass() {
comm_->end_pass();
}
void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } template <typename FVAccessor>
void HeterPs<FVAccessor>::show_one_table(int gpu_num) {
comm_->show_one_table(gpu_num);
}
void HeterPs::push_sparse(int num, template <typename FVAccessor>
FeatureKey* d_keys, void HeterPs<FVAccessor>::push_sparse(int num,
FeaturePushValue* d_grads, FeatureKey* d_keys,
size_t len) { float* d_grads,
comm_->push_sparse(num, d_keys, d_grads, len, opt_); size_t len) {
// comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); if (accessor_type_ == "CtrDymfAccessor") {
if (optimizer_type_ == 3) { // adam
auto optimizer = SparseAdamOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 4) { // shared_adam
auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 1) { // adagrad
auto optimizer = SparseAdagradOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else {
VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1),"
"adam(3) or shared_adam(4), bug get optimizer type:"
<< optimizer_type_;
}
} else {
VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get "
<< accessor_type_;
}
} }
void HeterPs::set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms, template <typename FVAccessor>
const std::vector<ncclComm_t>& inter_comms, void HeterPs<FVAccessor>::set_nccl_comm_and_size(
int comm_size) { const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms,
int comm_size) {
comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size);
} }
void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { template <typename FVAccessor>
void HeterPs<FVAccessor>::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) {
comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim);
} }
template <typename FVAccessor>
void HeterPs<FVAccessor>::set_accessor(FVAccessor& accessor) {
comm_->set_accessor(accessor);
}
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
#endif #endif
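push_sparse now binds the optimizer at call time from optimizer_type_ (1 = adagrad, 3 = adam, 4 = shared_adam, matching the codes above). A compilable sketch of that dispatch shape, with toy optimizer types standing in for SparseAdagradOptimizer/SparseAdamOptimizer/SparseAdamSharedOptimizer (only the 1/3/4 codes are taken from the diff; everything else is illustrative):

    #include <iostream>

    struct ToyAdagrad    { static constexpr const char* name = "adagrad"; };
    struct ToyAdam       { static constexpr const char* name = "adam"; };
    struct ToySharedAdam { static constexpr const char* name = "shared_adam"; };

    template <typename Optimizer, typename Fn>
    void run_with(Fn&& fn) { fn(Optimizer{}); }

    // Select an optimizer type by integer code, mirroring push_sparse above.
    template <typename Fn>
    void dispatch_optimizer(int optimizer_type, Fn&& fn) {
      switch (optimizer_type) {
        case 1: run_with<ToyAdagrad>(fn); break;     // adagrad
        case 3: run_with<ToyAdam>(fn); break;        // adam
        case 4: run_with<ToySharedAdam>(fn); break;  // shared_adam
        default:
          std::cerr << "unsupported optimizer type " << optimizer_type << "\n";
      }
    }

    int main() {
      dispatch_optimizer(3, [](auto opt) {
        std::cout << "push_sparse with " << decltype(opt)::name << "\n";
      });
      return 0;
    }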
...@@ -26,24 +26,23 @@ limitations under the License. */ ...@@ -26,24 +26,23 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename FVAccessor>
class HeterPs : public HeterPsBase { class HeterPs : public HeterPsBase {
public: public:
HeterPs() {} HeterPs() {}
HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource); HeterPs(size_t capacity,
std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type);
virtual ~HeterPs(); virtual ~HeterPs();
HeterPs(const HeterPs&) = delete; HeterPs(const HeterPs&) = delete;
HeterPs& operator=(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete;
void pull_sparse(int num, void pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeatureValue* d_vals, float* d_vals,
size_t len) override; size_t len) override;
void build_ps(int num,
FeatureKey* h_keys,
FeatureValue* h_vals,
size_t len,
size_t chunk_size,
int stream_num) override;
void build_ps(int num, void build_ps(int num,
FeatureKey* h_keys, FeatureKey* h_keys,
char* pool, char* pool,
...@@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase { ...@@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase {
const std::vector<ncclComm_t>& inter_comms, const std::vector<ncclComm_t>& inter_comms,
int comm_size) override; int comm_size) override;
void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override;
void set_accessor(FVAccessor& accessor);
#endif #endif
void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; void set_sparse_sgd(const OptimizerConfig& optimizer_config) override;
...@@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase { ...@@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase {
void show_one_table(int gpu_num) override; void show_one_table(int gpu_num) override;
void push_sparse(int num, void push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeaturePushValue* d_grads, float* d_grads,
size_t len) override; size_t len) override;
private: private:
std::shared_ptr<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>> comm_; std::shared_ptr<HeterComm<FeatureKey, float*, float*, FVAccessor>> comm_;
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
Optimizer<FeatureValue, FeaturePushValue> opt_; FVAccessor feature_value_accessor_;
std::string accessor_type_;
int optimizer_type_;
#endif #endif
}; };
......
...@@ -34,14 +34,8 @@ class HeterPsBase { ...@@ -34,14 +34,8 @@ class HeterPsBase {
virtual void pull_sparse(int num, virtual void pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeatureValue* d_vals, float* d_vals,
size_t len) = 0; size_t len) = 0;
virtual void build_ps(int num,
FeatureKey* h_keys,
FeatureValue* h_vals,
size_t len,
size_t chunk_size,
int stream_num) = 0;
virtual void build_ps(int num, virtual void build_ps(int num,
FeatureKey* h_keys, FeatureKey* h_keys,
char* pool, char* pool,
...@@ -56,19 +50,25 @@ class HeterPsBase { ...@@ -56,19 +50,25 @@ class HeterPsBase {
const std::vector<ncclComm_t>& inter_comms, const std::vector<ncclComm_t>& inter_comms,
int comm_size) = 0; int comm_size) = 0;
virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0;
#endif #endif
virtual void end_pass() = 0; virtual void end_pass() = 0;
virtual void show_one_table(int gpu_num) = 0; virtual void show_one_table(int gpu_num) = 0;
virtual void push_sparse(int num, virtual void push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeaturePushValue* d_grads, float* d_grads,
size_t len) = 0; size_t len) = 0;
virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0;
virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0;
static HeterPsBase* get_instance(size_t capacity, static HeterPsBase* get_instance(
std::shared_ptr<HeterPsResource> resource); size_t capacity,
std::shared_ptr<HeterPsResource> resource,
// CommonFeatureValueAccessor feature_value_accessor,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type);
}; };
} // end namespace framework } // end namespace framework
......
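The widened get_instance signature threads a flat string-to-float map through to the GPU table. A hypothetical caller-side sketch of that map, using the keys that add_sparse_optimizer/InitializeGPUServer read later in this diff (the actual get_instance call is left commented out, since HeterPsResource construction is not shown here):

    #include <string>
    #include <unordered_map>

    int main() {
      // Keys mirror what add_sparse_optimizer() emits further down in this
      // diff; the values shown are just the documented defaults.
      std::unordered_map<std::string, float> fleet_config = {
          {"optimizer_type", 3},  // 1 = adagrad, 3 = adam, 4 = shared_adam
          {"learning_rate", 0.05f},
          {"initial_range", 1e-4f},
          {"beta1_decay_rate", 0.9f},
          {"beta2_decay_rate", 0.999f},
          {"ada_epsilon", 1e-8f},
          {"mf_learning_rate", 0.05f},
          {"mf_beta1_decay_rate", 0.9f},
          {"mf_beta2_decay_rate", 0.999f},
          {"mf_ada_epsilon", 1e-8f},
      };
      // HeterPsBase* table = HeterPsBase::get_instance(
      //     capacity, resource, fleet_config, "CtrDymfAccessor",
      //     /*optimizer_type=*/3);
      (void)fleet_config;
      return 0;
    }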
...@@ -82,20 +82,6 @@ class HBMMemoryPool : public managed { ...@@ -82,20 +82,6 @@ class HBMMemoryPool : public managed {
cudaMemset(mem_, 0, block_size_ * capacity); cudaMemset(mem_, 0, block_size_ * capacity);
} }
friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) {
for (size_t k = 0; k < 5; k++) {
auto x = (FeatureValue*)(p.mem() + k * p.capacity());
out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot
<< " lr: " << x->lr << " mf_dim: " << x->mf_size
<< " mf_size: " << x->mf_size << " mf:";
for (int i = 0; i < x->mf_size + 1; ++i) {
out << " " << x->mf[i];
}
out << "\n";
}
return out;
}
char* mem() { return mem_; } char* mem() { return mem_; }
size_t capacity() { return capacity_; } size_t capacity() { return capacity_; }
......
...@@ -27,13 +27,19 @@ class OptimizerConfig { ...@@ -27,13 +27,19 @@ class OptimizerConfig {
float learning_rate = 0.05; float learning_rate = 0.05;
float initial_g2sum = 3.0; float initial_g2sum = 3.0;
float initial_range = 0; float initial_range = 0;
float beta1_decay_rate = 0.9; // adam
float beta2_decay_rate = 0.999; // adam
float ada_epsilon = 1e-8;
float mf_create_thresholds = 10; float mf_create_thresholds = 10;
float mf_learning_rate = 0.05; float mf_learning_rate = 0.05;
float mf_initial_g2sum = 3.0; float mf_initial_g2sum = 3.0;
float mf_initial_range = 1e-4; float mf_initial_range = 1e-4;
float mf_beta1_decay_rate = 0.9; // adam
float mf_beta2_decay_rate = 0.999; // adam
float mf_min_bound = -10; float mf_min_bound = -10;
float mf_max_bound = 10; float mf_max_bound = 10;
float mf_ada_epsilon = 1e-8;
void set_sparse_sgd(float nonclk_coeff, void set_sparse_sgd(float nonclk_coeff,
float clk_coeff, float clk_coeff,
...@@ -41,7 +47,10 @@ class OptimizerConfig { ...@@ -41,7 +47,10 @@ class OptimizerConfig {
float max_bound, float max_bound,
float learning_rate, float learning_rate,
float initial_g2sum, float initial_g2sum,
float initial_range) { float initial_range,
float beta1_decay_rate,
float beta2_decay_rate,
float ada_epsilon) {
this->nonclk_coeff = nonclk_coeff; this->nonclk_coeff = nonclk_coeff;
this->clk_coeff = clk_coeff; this->clk_coeff = clk_coeff;
this->min_bound = min_bound; this->min_bound = min_bound;
...@@ -49,6 +58,9 @@ class OptimizerConfig { ...@@ -49,6 +58,9 @@ class OptimizerConfig {
this->learning_rate = learning_rate; this->learning_rate = learning_rate;
this->initial_g2sum = initial_g2sum; this->initial_g2sum = initial_g2sum;
this->initial_range = initial_range; this->initial_range = initial_range;
this->beta1_decay_rate = beta1_decay_rate;
this->beta2_decay_rate = beta2_decay_rate;
this->ada_epsilon = ada_epsilon;
} }
void set_sparse_sgd(const OptimizerConfig& optimizer_config) { void set_sparse_sgd(const OptimizerConfig& optimizer_config) {
...@@ -59,6 +71,9 @@ class OptimizerConfig { ...@@ -59,6 +71,9 @@ class OptimizerConfig {
this->learning_rate = optimizer_config.learning_rate; this->learning_rate = optimizer_config.learning_rate;
this->initial_g2sum = optimizer_config.initial_g2sum; this->initial_g2sum = optimizer_config.initial_g2sum;
this->initial_range = optimizer_config.initial_range; this->initial_range = optimizer_config.initial_range;
this->beta1_decay_rate = optimizer_config.beta1_decay_rate;
this->beta2_decay_rate = optimizer_config.beta2_decay_rate;
this->ada_epsilon = optimizer_config.ada_epsilon;
} }
void set_embedx_sgd(float mf_create_thresholds, void set_embedx_sgd(float mf_create_thresholds,
...@@ -66,13 +81,19 @@ class OptimizerConfig { ...@@ -66,13 +81,19 @@ class OptimizerConfig {
float mf_initial_g2sum, float mf_initial_g2sum,
float mf_initial_range, float mf_initial_range,
float mf_min_bound, float mf_min_bound,
float mf_max_bound) { float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
this->mf_create_thresholds = mf_create_thresholds; this->mf_create_thresholds = mf_create_thresholds;
this->mf_learning_rate = mf_learning_rate; this->mf_learning_rate = mf_learning_rate;
this->mf_initial_g2sum = mf_initial_g2sum; this->mf_initial_g2sum = mf_initial_g2sum;
this->mf_initial_range = mf_initial_range; this->mf_initial_range = mf_initial_range;
this->mf_min_bound = mf_min_bound; this->mf_min_bound = mf_min_bound;
this->mf_max_bound = mf_max_bound; this->mf_max_bound = mf_max_bound;
this->mf_beta1_decay_rate = mf_beta1_decay_rate;
this->mf_beta2_decay_rate = mf_beta2_decay_rate;
this->mf_ada_epsilon = mf_ada_epsilon;
} }
void set_embedx_sgd(const OptimizerConfig& optimizer_config) { void set_embedx_sgd(const OptimizerConfig& optimizer_config) {
...@@ -82,6 +103,9 @@ class OptimizerConfig { ...@@ -82,6 +103,9 @@ class OptimizerConfig {
this->mf_initial_range = optimizer_config.mf_initial_range; this->mf_initial_range = optimizer_config.mf_initial_range;
this->mf_min_bound = optimizer_config.mf_min_bound; this->mf_min_bound = optimizer_config.mf_min_bound;
this->mf_max_bound = optimizer_config.mf_max_bound; this->mf_max_bound = optimizer_config.mf_max_bound;
this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate;
this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate;
this->mf_ada_epsilon = optimizer_config.mf_ada_epsilon;
} }
}; };
......
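The new beta1_decay_rate/beta2_decay_rate/ada_epsilon fields (and their mf_ twins) are the standard Adam hyperparameters. A scalar sketch of how such fields are typically consumed, using the textbook Adam update rather than the actual GPUPS kernel:

    #include <cmath>
    #include <cstdio>

    int main() {
      // Defaults mirror the fields above; the update itself is textbook Adam.
      float lr = 0.05f, beta1 = 0.9f, beta2 = 0.999f, eps = 1e-8f;
      float w = 0.f, m = 0.f, v = 0.f;         // weight, first/second moments
      float beta1_pow = 1.f, beta2_pow = 1.f;  // running beta^t products
      for (int t = 1; t <= 3; ++t) {
        float g = 0.1f;  // toy gradient
        m = beta1 * m + (1.f - beta1) * g;
        v = beta2 * v + (1.f - beta2) * g * g;
        beta1_pow *= beta1;
        beta2_pow *= beta2;
        float m_hat = m / (1.f - beta1_pow);   // bias-corrected moments
        float v_hat = v / (1.f - beta2_pow);
        w -= lr * m_hat / (std::sqrt(v_hat) + eps);
        std::printf("step %d: w = %f\n", t, w);
      }
      return 0;
    }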
...@@ -26,90 +26,6 @@ limitations under the License. */ ...@@ -26,90 +26,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
__global__ void PullCopy(float** dest,
const FeatureValue* src,
const int64_t* len,
int hidden,
int slot_num,
int total_len,
uint64_t** keys) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[x - 1] : 0);
if (*(keys[x] + y) == 0) {
*(dest[x] + y * hidden) = 0;
*(dest[x] + y * hidden + 1) = 0;
*(dest[x] + y * hidden + 2) = 0;
} else {
*(dest[x] + y * hidden) = (src + i)->show;
*(dest[x] + y * hidden + 1) = (src + i)->clk;
*(dest[x] + y * hidden + 2) = (src + i)->lr;
}
if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) {
for (int j = 0; j < hidden - 3; j++) {
*(dest[x] + y * hidden + 3 + j) = 0;
}
} else {
for (int j = 0; j < hidden - 3; j++) {
*(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j];
}
}
}
}
__global__ void PullCopy(float** dest,
const FeatureValue* src,
const int64_t* len,
int slot_num,
int total_len,
uint64_t** keys,
uint64_t max_val_size,
int* gpu_dim) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[x - 1] : 0);
FeatureValue* feature_value_ptr =
(FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
int mf_dim = gpu_dim[x] - 3;
if (*(keys[x] + y) == 0) {
*(dest[x] + y * (mf_dim + 3)) = 0;
*(dest[x] + y * (mf_dim + 3) + 1) = 0;
*(dest[x] + y * (mf_dim + 3) + 2) = 0;
} else {
*(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show;
*(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk;
*(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr;
}
if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) {
for (int j = 0; j < mf_dim; j++) {
*(dest[x] + y * (mf_dim + 3) + 3 + j) = 0;
}
} else {
for (int j = 0; j < mf_dim; j++) {
*(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j];
}
}
}
}
__global__ void CopyKeysKernel(uint64_t** src_keys, __global__ void CopyKeysKernel(uint64_t** src_keys,
uint64_t* dest_total_keys, uint64_t* dest_total_keys,
const int64_t* len, const int64_t* len,
...@@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest, ...@@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest,
} }
} }
__global__ void PushCopyWithPool(FeaturePushValue* dest,
float** src,
int64_t* len,
int slot_num,
uint64_t total_len,
int bs,
int* slot_vector,
int* mf_dim_vector,
size_t grad_value_size) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[low - 1] : 0);
FeaturePushValue* cur =
(FeaturePushValue*)((char*)dest + i * grad_value_size);
cur->slot = slot_vector[x];
int mf_dim = mf_dim_vector[x];
cur->mf_dim = mf_dim;
cur->show = *(src[x] + y * (mf_dim + 3));
cur->clk = *(src[x] + y * (mf_dim + 3) + 1);
cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
for (int j = 0; j < cur->mf_dim; j++) {
cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
}
}
}
PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; }
void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
cudaMemcpy(gpu_values,
values.data(),
values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
gpu_values,
total_values_gpu,
gpu_len,
hidden_size,
slot_num,
total_length,
gpu_keys);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
cudaMemcpy(gpu_values,
values.data(),
values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
gpu_values,
total_values_gpu,
gpu_len,
slot_num,
total_length,
gpu_keys,
val_type_size_,
gpu_dim);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys, uint64_t** origin_keys,
uint64_t* total_keys, uint64_t* total_keys,
...@@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, ...@@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
} }
void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const int hidden_size,
const int64_t total_length,
const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
auto buf_grad_value =
memory::Alloc(place, grad_values.size() * sizeof(float*));
auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
auto buf_slot_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
cudaMemcpy(gpu_values,
grad_values.data(),
grad_values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
cudaMemcpy(d_slot_vector,
slot_vector_.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
total_grad_values_gpu,
gpu_values,
gpu_len,
hidden_size,
slot_lengths.size(),
total_length,
batch_size,
d_slot_vector);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
auto buf_grad_value =
memory::Alloc(place, grad_values.size() * sizeof(float*));
auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
auto buf_slot_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
auto buf_mf_dim_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
int* d_mf_dim_vector = reinterpret_cast<int*>(buf_mf_dim_vector->ptr());
cudaMemcpy(gpu_values,
grad_values.data(),
grad_values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
cudaMemcpy(d_slot_vector,
slot_vector_.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(d_mf_dim_vector,
slot_mf_dim_vector_.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
total_grad_values_gpu,
gpu_values,
gpu_len,
slot_lengths.size(),
total_length,
batch_size,
d_slot_vector,
d_mf_dim_vector,
grad_value_size);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
float clk_coeff, float clk_coeff,
float min_bound, float min_bound,
float max_bound, float max_bound,
float learning_rate, float learning_rate,
float initial_g2sum, float initial_g2sum,
float initial_range) { float initial_range,
OptimizerConfig optimizer_config; float beta1_decay_rate,
optimizer_config.set_sparse_sgd(nonclk_coeff, float beta2_decay_rate,
clk_coeff, float ada_epsilon) {
min_bound, optimizer_config_.set_sparse_sgd(nonclk_coeff,
max_bound, clk_coeff,
learning_rate, min_bound,
initial_g2sum, max_bound,
initial_range); learning_rate,
HeterPs_->set_sparse_sgd(optimizer_config); initial_g2sum,
initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
} }
void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
...@@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, ...@@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_initial_g2sum, float mf_initial_g2sum,
float mf_initial_range, float mf_initial_range,
float mf_min_bound, float mf_min_bound,
float mf_max_bound) { float mf_max_bound,
OptimizerConfig optimizer_config; float mf_beta1_decay_rate,
optimizer_config.set_embedx_sgd(mf_create_thresholds, float mf_beta2_decay_rate,
mf_learning_rate, float mf_ada_epsilon) {
mf_initial_g2sum, optimizer_config_.set_embedx_sgd(mf_create_thresholds,
mf_initial_range, mf_learning_rate,
mf_min_bound, mf_initial_g2sum,
mf_max_bound); mf_initial_range,
HeterPs_->set_embedx_sgd(optimizer_config); mf_min_bound,
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
} }
} // end namespace framework } // end namespace framework
......
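SetSparseSGD/SetEmbedxSGD now write into the wrapper-owned optimizer_config_ instead of constructing a temporary and pushing it into HeterPs_, so they can be called before the table exists; presumably the cached config is handed over once the table is built. A sketch of that deferred-configuration pattern with stand-in types (not the Paddle classes):

    #include <memory>

    struct ToyOptimizerConfig { float learning_rate = 0.05f; };

    struct ToyHeterPs {
      void set_sparse_sgd(const ToyOptimizerConfig&) {}
    };

    struct ToyWrapper {
      // Cache only; the table does not need to exist yet.
      void SetSparseSGD(float lr) { optimizer_config_.learning_rate = lr; }
      // Apply the cached config once the table is actually built.
      void BuildTable() {
        heter_ps_ = std::make_unique<ToyHeterPs>();
        heter_ps_->set_sparse_sgd(optimizer_config_);
      }
      ToyOptimizerConfig optimizer_config_;
      std::unique_ptr<ToyHeterPs> heter_ps_;
    };

    int main() {
      ToyWrapper w;
      w.SetSparseSGD(0.05f);  // safe even before the table exists
      w.BuildTable();
      return 0;
    }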
...@@ -51,7 +51,10 @@ limitations under the License. */ ...@@ -51,7 +51,10 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_PSCORE #ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/distributed/the_one_ps.pb.h"
#endif #endif
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
#include "afs_api.h" #include "afs_api.h"
...@@ -64,9 +67,6 @@ limitations under the License. */ ...@@ -64,9 +67,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
class Dataset; class Dataset;
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
...@@ -98,7 +98,7 @@ class AfsWrapper { ...@@ -98,7 +98,7 @@ class AfsWrapper {
class PSGPUWrapper { class PSGPUWrapper {
public: public:
virtual ~PSGPUWrapper(); ~PSGPUWrapper();
PSGPUWrapper() { PSGPUWrapper() {
HeterPs_ = NULL; HeterPs_ = NULL;
...@@ -139,37 +139,6 @@ class PSGPUWrapper { ...@@ -139,37 +139,6 @@ class PSGPUWrapper {
const int64_t* gpu_len, const int64_t* gpu_len,
int slot_num, int slot_num,
int total_len); int total_len);
void CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length);
void CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim);
void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const int hidden_size,
const int64_t total_length,
const int batch_size);
void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size);
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task); void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task); void PreBuildTask(std::shared_ptr<HeterContext> gpu_task);
...@@ -274,13 +243,96 @@ class PSGPUWrapper { ...@@ -274,13 +243,96 @@ class PSGPUWrapper {
float max_bound, float max_bound,
float learning_rate, float learning_rate,
float initial_g2sum, float initial_g2sum,
float initial_range); float initial_range,
float beta1_decay_rate,
float beta2_decay_rate,
float ada_epsilon);
void SetEmbedxSGD(float mf_create_thresholds, void SetEmbedxSGD(float mf_create_thresholds,
float mf_learning_rate, float mf_learning_rate,
float mf_initial_g2sum, float mf_initial_g2sum,
float mf_initial_range, float mf_initial_range,
float mf_min_bound, float mf_min_bound,
float mf_max_bound); float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon);
#ifdef PADDLE_WITH_PSCORE
void add_sparse_optimizer(
std::unordered_map<std::string, float>& config, // NOLINT
const ::paddle::distributed::SparseCommonSGDRuleParameter& sgd_param,
const std::string& prefix = "") {
auto optimizer_name = sgd_param.name();
if (optimizer_name == "SparseNaiveSGDRule") {
config[prefix + "optimizer_type"] = 0;
config[prefix + "learning_rate"] = sgd_param.naive().learning_rate();
config[prefix + "initial_range"] = sgd_param.naive().initial_range();
config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1];
} else if (optimizer_name == "SparseAdaGradSGDRule") {
config[prefix + "optimizer_type"] = 1;
config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
} else if (optimizer_name == "StdAdaGradSGDRule") {
config[prefix + "optimizer_type"] = 2;
config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
} else if (optimizer_name == "SparseAdamSGDRule") {
config[prefix + "optimizer_type"] = 3;
config[prefix + "learning_rate"] = sgd_param.adam().learning_rate();
config[prefix + "initial_range"] = sgd_param.adam().initial_range();
config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate();
config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate();
config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon();
config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1];
} else if (optimizer_name == "SparseSharedAdamSGDRule") {
config[prefix + "optimizer_type"] = 4;
config[prefix + "learning_rate"] = sgd_param.adam().learning_rate();
config[prefix + "initial_range"] = sgd_param.adam().initial_range();
config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate();
config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate();
config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon();
config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1];
}
}
void InitializeGPUServer(paddle::distributed::PSParameter ps_param) {
auto sparse_table =
ps_param.server_param().downpour_server_param().downpour_table_param(0);
auto sparse_table_accessor = sparse_table.accessor();
auto sparse_table_accessor_parameter =
sparse_table_accessor.ctr_accessor_param();
accessor_class_ = sparse_table_accessor.accessor_class();
std::unordered_map<std::string, float> config;
config["embedx_dim"] = sparse_table_accessor.embedx_dim();
config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
if (accessor_class_ == "CtrDymfAccessor") {
// optimizer config for embed_w and embedx
add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
add_sparse_optimizer(
config, sparse_table_accessor.embedx_sgd_param(), "mf_");
}
fleet_config_ = config;
GlobalAccessorTransfor::GetInstance().Init(accessor_class_);
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure(
config);
InitializeGPUServer(config);
}
#endif
void InitializeGPUServer(std::unordered_map<std::string, float> config) { void InitializeGPUServer(std::unordered_map<std::string, float> config) {
float nonclk_coeff = (config.find("nonclk_coeff") == config.end()) float nonclk_coeff = (config.find("nonclk_coeff") == config.end())
? 1.0 ? 1.0
...@@ -288,54 +340,83 @@ class PSGPUWrapper { ...@@ -288,54 +340,83 @@ class PSGPUWrapper {
float clk_coeff = float clk_coeff =
(config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"]; (config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"];
float min_bound = (config.find("min_bound") == config.end()) float min_bound = (config.find("min_bound") == config.end())
? -10000.0 ? -10.0
: config["min_bound"]; : config["min_bound"];
float max_bound = (config.find("max_bound") == config.end()) float max_bound =
? 10000.0 (config.find("max_bound") == config.end()) ? 10.0 : config["max_bound"];
: config["max_bound"];
float learning_rate = (config.find("learning_rate") == config.end()) float learning_rate = (config.find("learning_rate") == config.end())
? 1.0 ? 0.05
: config["learning_rate"]; : config["learning_rate"];
float initial_g2sum = (config.find("initial_g2sum") == config.end()) float initial_g2sum = (config.find("initial_g2sum") == config.end())
? 1.0 ? 3.0
: config["initial_g2sum"]; : config["initial_g2sum"];
float initial_range = (config.find("initial_range") == config.end()) float initial_range = (config.find("initial_range") == config.end())
? 1.0 ? 1e-4
: config["initial_range"]; : config["initial_range"];
float beta1_decay_rate = (config.find("beta1_decay_rate") == config.end())
? 0.9
: config["beta1_decay_rate"];
float beta2_decay_rate = (config.find("beta2_decay_rate") == config.end())
? 0.999
: config["beta2_decay_rate"];
float ada_epsilon = (config.find("ada_epsilon") == config.end())
? 1e-8
: config["ada_epsilon"];
// mf config settings // mf config settings
float mf_create_thresholds = float mf_create_thresholds =
(config.find("mf_create_thresholds") == config.end()) (config.find("mf_create_thresholds") == config.end())
? static_cast<float>(1.0) ? static_cast<float>(1.0)
: config["mf_create_thresholds"]; : config["mf_create_thresholds"];
float mf_learning_rate = (config.find("mf_learning_rate") == config.end()) float mf_learning_rate = (config.find("mf_learning_rate") == config.end())
? 1.0 ? 0.05
: config["mf_learning_rate"]; : config["mf_learning_rate"];
float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end()) float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end())
? 1.0 ? 3.0
: config["mf_initial_g2sum"]; : config["mf_initial_g2sum"];
float mf_initial_range = (config.find("mf_initial_range") == config.end()) float mf_initial_range = (config.find("mf_initial_range") == config.end())
? 1.0 ? 1e-4
: config["mf_initial_range"]; : config["mf_initial_range"];
float mf_min_bound = (config.find("mf_min_bound") == config.end()) float mf_min_bound = (config.find("mf_min_bound") == config.end())
? 1.0 ? -10.0
: config["mf_min_bound"]; : config["mf_min_bound"];
float mf_max_bound = (config.find("mf_max_bound") == config.end()) float mf_max_bound = (config.find("mf_max_bound") == config.end())
? 1.0 ? 10.0
: config["mf_max_bound"]; : config["mf_max_bound"];
float mf_beta1_decay_rate =
(config.find("mf_beta1_decay_rate") == config.end())
? 0.9
: config["mf_beta1_decay_rate"];
float mf_beta2_decay_rate =
(config.find("mf_beta2_decay_rate") == config.end())
? 0.999
: config["mf_beta2_decay_rate"];
float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end())
? 1e-8
: config["mf_ada_epsilon"];
this->SetSparseSGD(nonclk_coeff, this->SetSparseSGD(nonclk_coeff,
clk_coeff, clk_coeff,
min_bound, min_bound,
max_bound, max_bound,
learning_rate, learning_rate,
initial_g2sum, initial_g2sum,
initial_range); initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
this->SetEmbedxSGD(mf_create_thresholds, this->SetEmbedxSGD(mf_create_thresholds,
mf_learning_rate, mf_learning_rate,
mf_initial_g2sum, mf_initial_g2sum,
mf_initial_range, mf_initial_range,
mf_min_bound, mf_min_bound,
mf_max_bound); mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
// set optimizer type (naive, adagrad, std_adagrad, adam, shared_adam)
optimizer_type_ = (config.find("optimizer_type") == config.end())
? 1
: static_cast<int>(config["optimizer_type"]);
} }
void SetDate(int year, int month, int day) { void SetDate(int year, int month, int day) {
...@@ -380,7 +461,7 @@ class PSGPUWrapper { ...@@ -380,7 +461,7 @@ class PSGPUWrapper {
if (slot_info_initialized_) { if (slot_info_initialized_) {
return; return;
} }
SlotRecordDataset* dataset = dynamic_cast<SlotRecordDataset*>(dataset_); SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
auto slots_vec = dataset->GetSlots(); auto slots_vec = dataset->GetSlots();
slot_offset_vector_.clear(); slot_offset_vector_.clear();
for (auto& slot : slot_vector_) { for (auto& slot : slot_vector_) {
...@@ -421,10 +502,13 @@ class PSGPUWrapper { ...@@ -421,10 +502,13 @@ class PSGPUWrapper {
for (size_t i = 0; i < slot_index_vec_.size(); i++) { for (size_t i = 0; i < slot_index_vec_.size(); i++) {
slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]];
} }
val_type_size_ =
TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); auto accessor_wrapper_ptr =
grad_type_size_ = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_
<< " grad_type_size_:" << grad_type_size_;
slot_info_initialized_ = true; slot_info_initialized_ = true;
} }
#endif #endif
...@@ -445,6 +529,12 @@ class PSGPUWrapper { ...@@ -445,6 +529,12 @@ class PSGPUWrapper {
const std::string& conf); const std::string& conf);
#endif #endif
#ifdef PADDLE_WITH_PSCORE
void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) {
cpu_table_accessor_ = accessor;
}
#endif
private: private:
static std::shared_ptr<PSGPUWrapper> s_instance_; static std::shared_ptr<PSGPUWrapper> s_instance_;
Dataset* dataset_; Dataset* dataset_;
...@@ -497,6 +587,12 @@ class PSGPUWrapper { ...@@ -497,6 +587,12 @@ class PSGPUWrapper {
int day_; int day_;
bool slot_info_initialized_ = false; bool slot_info_initialized_ = false;
int use_afs_api_ = 0; int use_afs_api_ = 0;
int optimizer_type_ = 1;
std::string accessor_class_;
std::unordered_map<std::string, float> fleet_config_;
#ifdef PADDLE_WITH_PSCORE
paddle::distributed::ValueAccessor* cpu_table_accessor_;
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::vector<MemoryPool*> mem_pools_; std::vector<MemoryPool*> mem_pools_;
...@@ -521,6 +617,7 @@ class PSGPUWrapper { ...@@ -521,6 +617,7 @@ class PSGPUWrapper {
bool running_ = false; bool running_ = false;
std::vector<std::shared_ptr<ThreadPool>> pull_thread_pool_; std::vector<std::shared_ptr<ThreadPool>> pull_thread_pool_;
std::vector<std::shared_ptr<ThreadPool>> hbm_thread_pool_; std::vector<std::shared_ptr<ThreadPool>> hbm_thread_pool_;
OptimizerConfig optimizer_config_;
protected: protected:
static bool is_initialized_; static bool is_initialized_;
......
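Every default in InitializeGPUServer above follows the same find-or-fallback lookup on the config map. A small helper expressing that pattern (a sketch only; no such helper exists in the diff):

    #include <string>
    #include <unordered_map>

    // Find-or-fallback lookup used (inline) for every hyperparameter above.
    static float get_or_default(
        const std::unordered_map<std::string, float>& config,
        const std::string& key, float default_value) {
      auto it = config.find(key);
      return it == config.end() ? default_value : it->second;
    }

    int main() {
      std::unordered_map<std::string, float> config = {{"learning_rate", 0.05f}};
      float learning_rate = get_or_default(config, "learning_rate", 0.05f);
      float beta1 = get_or_default(config, "beta1_decay_rate", 0.9f);  // absent
      (void)learning_rate;
      (void)beta1;
      return 0;
    }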
...@@ -594,6 +594,21 @@ class DistributedStrategy(object): ...@@ -594,6 +594,21 @@ class DistributedStrategy(object):
bounds = strategy.get(prefix + 'sparse_weight_bounds', bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10]) [-10, 10])
sgd.adam.weight_bounds.extend(bounds) sgd.adam.weight_bounds.extend(bounds)
elif optimizer_name == "shared_adam":
sgd.name = 'SparseSharedAdamSGDRule'
sgd.adam.learning_rate = strategy.get(
prefix + 'sparse_learning_rate', 0.001)
sgd.adam.initial_range = strategy.get(
prefix + 'sparse_initial_range', 1e-4)
sgd.adam.beta1_decay_rate = strategy.get(
prefix + 'sparse_beta1_decay_rate', 0.9)
sgd.adam.beta2_decay_rate = strategy.get(
prefix + 'sparse_beta2_decay_rate', 0.999)
sgd.adam.ada_epsilon = strategy.get(
prefix + 'sparse_ada_epsilon', 1e-8)
bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10])
sgd.adam.weight_bounds.extend(bounds)
def set_sparse_table_config(table_data, config): def set_sparse_table_config(table_data, config):
for key in config: for key in config:
......
...@@ -195,7 +195,7 @@ class Accessor: ...@@ -195,7 +195,7 @@ class Accessor:
sgd_param.naive.initial_range = 0.0001 sgd_param.naive.initial_range = 0.0001
if len(sgd_param.naive.weight_bounds) == 0: if len(sgd_param.naive.weight_bounds) == 0:
sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) sgd_param.naive.weight_bounds.extend([-10.0, 10.0])
if sgd_param.name == "SparseAdamSGDRule": if sgd_param.name == "SparseAdamSGDRule" or sgd_param.name == "SparseSharedAdamSGDRule":
if not sgd_param.adam.HasField("learning_rate"): if not sgd_param.adam.HasField("learning_rate"):
sgd_param.adam.learning_rate = 0.001 sgd_param.adam.learning_rate = 0.001
if not sgd_param.adam.HasField("initial_range"): if not sgd_param.adam.HasField("initial_range"):
......
...@@ -334,6 +334,14 @@ class TestStrategyConfig(unittest.TestCase): ...@@ -334,6 +334,14 @@ class TestStrategyConfig(unittest.TestCase):
strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad. strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
initial_range, 0.0001) initial_range, 0.0001)
strategy = paddle.distributed.fleet.DistributedStrategy()
configs = {}
configs['emb'] = {"sparse_optimizer": "shared_adam"}
strategy.fleet_desc_configs = configs
self.assertEqual(
strategy.sparse_table_configs[0].accessor.embed_sgd_param.adam.
beta1_decay_rate, 0.9)
def test_trainer_desc_configs(self): def test_trainer_desc_configs(self):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
configs = { configs = {
......
...@@ -671,7 +671,8 @@ HIGH_PARALLEL_JOB_NEW = [ ...@@ -671,7 +671,8 @@ HIGH_PARALLEL_JOB_NEW = [
'test_trt_convert_reduce_sum', 'test_trt_convert_reduce_sum',
'save_quant2_model_lstm', 'save_quant2_model_lstm',
'test_trt_convert_slice', 'test_trt_convert_slice',
'test_quant2_int8_lstm_mkldnn' 'test_quant2_int8_lstm_mkldnn',
'test_dist_fleet_ps13'
] ]
# mem=0 but always timeout or failed : It run 15 job each time in Single cases; # mem=0 but always timeout or failed : It run 15 job each time in Single cases;
......