Unverified commit b8d106e1, authored by danleifeng, committed by GitHub

[GPUPS] Adam accessor (#43919)

* add adam/sharedadam optimizer for gpups; edit optimizer struct; test=develop
Parent 1882ffd5
...@@ -31,6 +31,7 @@ int CtrDymfAccessor::Initialize() { ...@@ -31,6 +31,7 @@ int CtrDymfAccessor::Initialize() {
_embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
_embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(), _embedx_sgd_rule->LoadConfig(_config.embedx_sgd_param(),
_config.embedx_dim()); _config.embedx_dim());
common_feature_value.optimizer_name = name;
common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim(); common_feature_value.embed_sgd_dim = _embed_sgd_rule->Dim();
common_feature_value.embedx_dim = _config.embedx_dim(); common_feature_value.embedx_dim = _config.embedx_dim();
...@@ -42,7 +43,10 @@ int CtrDymfAccessor::Initialize() { ...@@ -42,7 +43,10 @@ int CtrDymfAccessor::Initialize() {
if (_config.ctr_accessor_param().show_scale()) { if (_config.ctr_accessor_param().show_scale()) {
_show_scale = true; _show_scale = true;
} }
VLOG(0) << " INTO CtrDymfAccessor::Initialize()"; VLOG(0) << " INTO CtrDymfAccessor::Initialize(); embed_sgd_dim:"
<< common_feature_value.embed_sgd_dim
<< " embedx_dim:" << common_feature_value.embedx_dim
<< " embedx_sgd_dim:" << common_feature_value.embedx_sgd_dim;
InitAccessorInfo(); InitAccessorInfo();
return 0; return 0;
} }
...@@ -53,9 +57,9 @@ void CtrDymfAccessor::InitAccessorInfo() { ...@@ -53,9 +57,9 @@ void CtrDymfAccessor::InitAccessorInfo() {
auto embedx_dim = _config.embedx_dim(); auto embedx_dim = _config.embedx_dim();
VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim; VLOG(0) << "InitAccessorInfo embedx_dim:" << embedx_dim;
_accessor_info.select_dim = 3 + embedx_dim; _accessor_info.select_dim = 4 + embedx_dim;
_accessor_info.select_size = _accessor_info.select_dim * sizeof(float); _accessor_info.select_size = _accessor_info.select_dim * sizeof(float);
_accessor_info.update_dim = 4 + embedx_dim; _accessor_info.update_dim = 5 + embedx_dim;
_accessor_info.update_size = _accessor_info.update_dim * sizeof(float); _accessor_info.update_size = _accessor_info.update_dim * sizeof(float);
_accessor_info.mf_size = _accessor_info.mf_size =
(embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float); (embedx_dim + common_feature_value.embedx_sgd_dim) * sizeof(float);
...@@ -179,8 +183,10 @@ int32_t CtrDymfAccessor::Create(float** values, size_t num) { ...@@ -179,8 +183,10 @@ int32_t CtrDymfAccessor::Create(float** values, size_t num) {
value[common_feature_value.ClickIndex()] = 0; value[common_feature_value.ClickIndex()] = 0;
value[common_feature_value.SlotIndex()] = -1; value[common_feature_value.SlotIndex()] = -1;
value[common_feature_value.MfDimIndex()] = -1; value[common_feature_value.MfDimIndex()] = -1;
_embed_sgd_rule->InitValue(value + common_feature_value.EmbedWIndex(), _embed_sgd_rule->InitValue(
value + common_feature_value.EmbedG2SumIndex()); value + common_feature_value.EmbedWIndex(),
value + common_feature_value.EmbedG2SumIndex(),
false); // adam embed init not zero, adagrad embed init zero
_embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(), _embedx_sgd_rule->InitValue(value + common_feature_value.EmbedxWIndex(),
value + common_feature_value.EmbedxG2SumIndex(), value + common_feature_value.EmbedxG2SumIndex(),
false); false);
...@@ -293,22 +299,14 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) { ...@@ -293,22 +299,14 @@ std::string CtrDymfAccessor::ParseToString(const float* v, int param) {
i++) { i++) {
os << " " << v[i]; os << " " << v[i];
} }
// os << " " << common_feature_value.Slot(const_cast<float*>(v)) << " "
// << common_feature_value.MfDim(const_cast<float*>(v));
auto show = common_feature_value.Show(const_cast<float*>(v)); auto show = common_feature_value.Show(const_cast<float*>(v));
auto click = common_feature_value.Click(const_cast<float*>(v)); auto click = common_feature_value.Click(const_cast<float*>(v));
auto score = ShowClickScore(show, click); auto score = ShowClickScore(show, click);
auto mf_dim = int(common_feature_value.MfDim(const_cast<float*>(v)));
if (score >= _config.embedx_threshold() && if (score >= _config.embedx_threshold() &&
param > common_feature_value.EmbedxG2SumIndex()) { param > common_feature_value.EmbedxG2SumIndex()) {
// VLOG(1) << "common_feature_value.EmbedxG2SumIndex():"
// << common_feature_value.EmbedxG2SumIndex();
// VLOG(1) << "common_feature_value.EmbedxWIndex():"
// << common_feature_value.EmbedxWIndex();
// VLOG(1) << "common_feature_value.MfDim():"
// << common_feature_value.MfDim(const_cast<float*>(v));
for (auto i = common_feature_value.EmbedxG2SumIndex(); for (auto i = common_feature_value.EmbedxG2SumIndex();
i < common_feature_value.EmbedxWIndex() + i < common_feature_value.Dim(mf_dim);
common_feature_value.MfDim(const_cast<float*>(v));
++i) { ++i) {
os << " " << v[i]; os << " " << v[i];
} }
......
...@@ -54,10 +54,24 @@ class CtrDymfAccessor : public ValueAccessor { ...@@ -54,10 +54,24 @@ class CtrDymfAccessor : public ValueAccessor {
int ClickIndex() { return ShowIndex() + 1; } int ClickIndex() { return ShowIndex() + 1; }
int EmbedWIndex() { return ClickIndex() + 1; } int EmbedWIndex() { return ClickIndex() + 1; }
int EmbedG2SumIndex() { return EmbedWIndex() + 1; } int EmbedG2SumIndex() { return EmbedWIndex() + 1; }
int SlotIndex() { return EmbedG2SumIndex() + 1; } int SlotIndex() { return EmbedG2SumIndex() + embed_sgd_dim; }
int MfDimIndex() { return SlotIndex() + 1; } int MfDimIndex() { return SlotIndex() + 1; }
int EmbedxG2SumIndex() { return MfDimIndex() + 1; } int EmbedxG2SumIndex() { return MfDimIndex() + 1; }
int EmbedxWIndex() { return EmbedxG2SumIndex() + 1; } int EmbedxWIndex() { return EmbedxG2SumIndex() + embedx_sgd_dim; }
// total value length (number of floats) computed from mf_dim
int Dim(int& mf_dim) {
int tmp_embedx_sgd_dim = 1;
if (optimizer_name == "SparseAdamSGDRule") { // adam
tmp_embedx_sgd_dim = mf_dim * 2 + 2;
} else if (optimizer_name == "SparseSharedAdamSGDRule") { // shared_adam
tmp_embedx_sgd_dim = 4;
}
return 7 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim;
}
// total size in bytes computed from mf_dim
int Size(int& mf_dim) { return (Dim(mf_dim)) * sizeof(float); }
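// Illustrative example (assumed values): with embed_sgd_dim = 4 and mf_dim = 8,
// SparseAdamSGDRule gives Dim = 7 + 4 + (8 * 2 + 2) + 8 = 37 floats, while
// SparseSharedAdamSGDRule gives Dim = 7 + 4 + 4 + 8 = 23 floats.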
float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; } float& UnseenDays(float* val) { return val[UnseenDaysIndex()]; }
float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; } float& DeltaScore(float* val) { return val[DeltaScoreIndex()]; }
...@@ -73,6 +87,7 @@ class CtrDymfAccessor : public ValueAccessor { ...@@ -73,6 +87,7 @@ class CtrDymfAccessor : public ValueAccessor {
int embed_sgd_dim; int embed_sgd_dim;
int embedx_dim; int embedx_dim;
int embedx_sgd_dim; int embedx_sgd_dim;
std::string optimizer_name;
}; };
struct CtrDymfPushValue { struct CtrDymfPushValue {
......
...@@ -213,7 +213,6 @@ void SparseAdamSGDRule::UpdateValueWork(float* w, ...@@ -213,7 +213,6 @@ void SparseAdamSGDRule::UpdateValueWork(float* w,
float beta1_pow_ = *beta1_pow; float beta1_pow_ = *beta1_pow;
float beta2_pow_ = *beta2_pow; float beta2_pow_ = *beta2_pow;
// lr not change in one update
lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
for (size_t i = 0; i < _embedding_dim; i++) { for (size_t i = 0; i < _embedding_dim; i++) {
// Calculation // Calculation
...@@ -252,5 +251,88 @@ void SparseAdamSGDRule::InitValueWork(float* value, ...@@ -252,5 +251,88 @@ void SparseAdamSGDRule::InitValueWork(float* value,
*(sgd + Beta1PowIndex()) = _beta1_decay_rate; *(sgd + Beta1PowIndex()) = _beta1_decay_rate;
*(sgd + Beta2PowIndex()) = _beta2_decay_rate; *(sgd + Beta2PowIndex()) = _beta2_decay_rate;
} }
void SparseSharedAdamSGDRule::LoadConfig(
const SparseCommonSGDRuleParameter& param, size_t emb_dim) {
_embedding_dim = emb_dim;
auto adam_param = param.adam();
learning_rate_ = adam_param.learning_rate();
_initial_range = adam_param.initial_range();
_beta1_decay_rate = adam_param.beta1_decay_rate();
_beta2_decay_rate = adam_param.beta2_decay_rate();
_ada_epsilon = adam_param.ada_epsilon();
if (adam_param.weight_bounds_size() == 0) {
_min_bound = -std::numeric_limits<float>::max();
_max_bound = std::numeric_limits<float>::max();
} else {
CHECK(adam_param.weight_bounds_size() >= 2)
<< "invalid repeated size for weight_bounds:"
<< adam_param.weight_bounds_size();
_min_bound = adam_param.weight_bounds(0);
_max_bound = adam_param.weight_bounds(1);
}
}
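// SparseSharedAdamSGDRule shares a single (gsum, g2sum) moment pair across all
// embedding dimensions, so the optimizer state per feature is only 4 floats.
// Sketch of the update performed below (g_i is the pushed gradient, beta1/beta2
// are the decay rates, weights are then clipped by BoundValue):
//   lr'   = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)
//   m_i   = beta1 * gsum  + (1 - beta1) * g_i
//   v_i   = beta2 * g2sum + (1 - beta2) * g_i * g_i
//   w_i   = w_i - lr' * m_i / (sqrt(v_i) + epsilon)
//   gsum  = mean_i(m_i),  g2sum = mean_i(v_i)
//   beta1_pow *= beta1,   beta2_pow *= beta2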
void SparseSharedAdamSGDRule::UpdateValueWork(float* w,
float* sgd,
const float* grad,
float scale) {
float* gsum = sgd + GSumIndex();
float* g2sum = sgd + G2SumIndex();
float* beta1_pow = sgd + Beta1PowIndex();
float* beta2_pow = sgd + Beta2PowIndex();
const float* g = grad;
float lr = learning_rate_;
float beta1_pow_ = *beta1_pow;
float beta2_pow_ = *beta2_pow;
float gsum_ = *gsum;
float g2sum_ = *g2sum;
lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_);
double sum_gsum = 0.0;
double sum_g2sum = 0.0;
for (int i = 0; i < _embedding_dim; i++) {
// Calculation
double new_gsum =
_beta1_decay_rate * gsum_ + (1 - _beta1_decay_rate) * g[i];
double new_g2sum =
_beta2_decay_rate * g2sum_ + (1 - _beta2_decay_rate) * g[i] * g[i];
w[i] = w[i] - lr * (new_gsum / (sqrt(new_g2sum) + _ada_epsilon));
BoundValue(w[i]);
sum_gsum += new_gsum;
sum_g2sum += new_g2sum;
}
// update beta_pow_decay
(*gsum) = sum_gsum / _embedding_dim;
(*g2sum) = sum_g2sum / _embedding_dim;
(*beta1_pow) *= _beta1_decay_rate;
(*beta2_pow) *= _beta2_decay_rate;
}
void SparseSharedAdamSGDRule::InitValueWork(float* value,
float* sgd,
bool zero_init) {
for (int i = 0; i < _embedding_dim; ++i) {
if (zero_init) {
value[i] = 0.0;
BoundValue(value[i]);
} else {
value[i] =
(local_uniform_real_distribution<double>()(local_random_engine()) *
2 -
1) *
_initial_range;
BoundValue(value[i]);
}
}
// init rule gsum and g2sum
for (int i = GSumIndex(); i < Beta1PowIndex(); i++) {
sgd[i] = 0.0;
}
// init beta1_pow and beta2_pow
*(sgd + Beta1PowIndex()) = _beta1_decay_rate;
*(sgd + Beta2PowIndex()) = _beta2_decay_rate;
}
} // namespace distributed } // namespace distributed
} // namespace paddle } // namespace paddle
...@@ -144,5 +144,28 @@ class SparseAdamSGDRule : public SparseValueSGDRule { ...@@ -144,5 +144,28 @@ class SparseAdamSGDRule : public SparseValueSGDRule {
float _beta2_decay_rate; float _beta2_decay_rate;
float _ada_epsilon; float _ada_epsilon;
}; };
class SparseSharedAdamSGDRule : public SparseValueSGDRule {
public:
virtual void LoadConfig(const SparseCommonSGDRuleParameter& param,
size_t emb_dim);
virtual void UpdateValueWork(float* w,
float* sgd,
const float* push_value,
float scale);
virtual void InitValueWork(float* value, float* sgd, bool zero_init);
virtual size_t Dim() { return 4; }
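// per-feature optimizer state layout: [gsum, g2sum, beta1_pow, beta2_pow]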
size_t GSumIndex() { return 0; }
size_t G2SumIndex() { return GSumIndex() + 1; }
size_t Beta1PowIndex() { return G2SumIndex() + 1; }
size_t Beta2PowIndex() { return Beta1PowIndex() + 1; }
protected:
float learning_rate_;
float _beta1_decay_rate;
float _beta2_decay_rate;
float _ada_epsilon;
};
} // namespace distributed } // namespace distributed
} // namespace paddle } // namespace paddle
...@@ -49,6 +49,7 @@ REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); ...@@ -49,6 +49,7 @@ REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule); REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule);
REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseSharedAdamSGDRule);
int32_t TableManager::Initialize() { int32_t TableManager::Initialize() {
static bool initialized = false; static bool initialized = false;
......
...@@ -13,6 +13,7 @@ cc_library( ...@@ -13,6 +13,7 @@ cc_library(
op_registry op_registry
fs fs
shell shell
ps_gpu_wrapper
${RPC_DEPS}) ${RPC_DEPS})
target_link_libraries(fleet z) target_link_libraries(fleet z)
...@@ -18,6 +18,10 @@ limitations under the License. */ ...@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h" #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/table/table.h" #include "paddle/fluid/distributed/ps/table/table.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#endif
namespace paddle { namespace paddle {
namespace distributed { namespace distributed {
...@@ -129,6 +133,13 @@ void FleetWrapper::InitWorker(const std::string& dist_desc, ...@@ -129,6 +133,13 @@ void FleetWrapper::InitWorker(const std::string& dist_desc,
worker_ptr_ = std::shared_ptr<paddle::distributed::PSClient>( worker_ptr_ = std::shared_ptr<paddle::distributed::PSClient>(
paddle::distributed::PSClientFactory::Create(ps_param)); paddle::distributed::PSClientFactory::Create(ps_param));
worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index); worker_ptr_->Configure(ps_param, dense_pull_regions, ps_env_, index);
#if defined PADDLE_WITH_HETERPS && defined PADDLE_WITH_PSCORE
VLOG(3) << "FleetWrapper::InitWorker InitializeGPUServer";
auto* accessor = worker_ptr_->GetTableAccessor(0);
auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance();
ps_gpu_wrapper->InitializeGPUServer(ps_param);
ps_gpu_wrapper->SetTableAccessor(accessor);
#endif
} }
} else { } else {
VLOG(3) << "Client can be initialized only once"; VLOG(3) << "Client can be initialized only once";
...@@ -525,11 +536,11 @@ void FleetWrapper::PushSparseFromTensorAsync( ...@@ -525,11 +536,11 @@ void FleetWrapper::PushSparseFromTensorAsync(
int batch_size = -1; int batch_size = -1;
bool batch_size_consist = true; bool batch_size_consist = true;
for (auto* input : *inputs) { for (auto* input : *inputs) {
int cur_batch_size = size_t cur_batch_size =
input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0]; input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
if (batch_size == -1) { if (batch_size == -1) {
batch_size = cur_batch_size; batch_size = int(cur_batch_size);
} else if (batch_size != cur_batch_size) { } else if (batch_size != int(cur_batch_size)) {
// CHECK(batch_size == cur_batch_size); // NOLINT // CHECK(batch_size == cur_batch_size); // NOLINT
batch_size_consist = false; batch_size_consist = false;
break; break;
...@@ -537,12 +548,12 @@ void FleetWrapper::PushSparseFromTensorAsync( ...@@ -537,12 +548,12 @@ void FleetWrapper::PushSparseFromTensorAsync(
} }
CHECK(batch_size > 0); // NOLINT CHECK(batch_size > 0); // NOLINT
int show_size = size_t show_size =
shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0]; shows->lod().size() ? shows->lod()[0].size() - 1 : shows->dims()[0];
CHECK(show_size == batch_size || show_size == 1); CHECK(show_size == size_t(batch_size) || show_size == 1);
int clk_size = size_t clk_size =
clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0]; clks->lod().size() ? clks->lod()[0].size() - 1 : clks->dims()[0];
CHECK(clk_size == batch_size || clk_size == 1); CHECK(clk_size == size_t(batch_size) || clk_size == 1);
CHECK(outputs->size() == inputs->size()); CHECK(outputs->size() == inputs->size());
std::vector<uint64_t> push_keys; std::vector<uint64_t> push_keys;
...@@ -601,12 +612,10 @@ void FleetWrapper::PushSparseFromTensorAsync( ...@@ -601,12 +612,10 @@ void FleetWrapper::PushSparseFromTensorAsync(
// in // in
// ctr_accessor.h // ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] = (static_cast<int>(i) >= show_size push_values.back()[1] =
? 1 (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
: static_cast<float>(show_tensor[i])); push_values.back()[2] =
push_values.back()[2] = (static_cast<int>(i) >= clk_size (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
? 0
: static_cast<float>(clk_tensor[i]));
float* data = push_values.back().data() + 3; float* data = push_values.back().data() + 3;
memcpy(data, g + output_len, sizeof(float) * fea_dim); memcpy(data, g + output_len, sizeof(float) * fea_dim);
} }
...@@ -630,12 +639,10 @@ void FleetWrapper::PushSparseFromTensorAsync( ...@@ -630,12 +639,10 @@ void FleetWrapper::PushSparseFromTensorAsync(
// slot show clk grad... consistent with CtrCommonPushValue defined in // slot show clk grad... consistent with CtrCommonPushValue defined in
// ctr_accessor.h // ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] = (static_cast<int>(i) >= show_size push_values.back()[1] =
? 1 (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
: static_cast<float>(show_tensor[i])); push_values.back()[2] =
push_values.back()[2] = (static_cast<int>(i) >= clk_size (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
? 0
: static_cast<float>(clk_tensor[i]));
float* data = push_values.back().data() + 3; float* data = push_values.back().data() + 3;
memcpy(data, g + output_len, sizeof(float) * fea_dim); memcpy(data, g + output_len, sizeof(float) * fea_dim);
} }
......
...@@ -197,14 +197,14 @@ message TableParameter { ...@@ -197,14 +197,14 @@ message TableParameter {
message TableAccessorParameter { message TableAccessorParameter {
optional string accessor_class = 1; optional string accessor_class = 1;
optional SGDParameter embed_sgd_param = 2;
optional SGDParameter embedx_sgd_param = 3;
optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value optional uint32 fea_dim = 4 [ default = 11 ]; // field size of one value
optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size optional uint32 embedx_dim = 5 [ default = 8 ]; // embedx feature size
optional uint32 embedx_threshold = 6 optional uint32 embedx_threshold = 6
[ default = 10 ]; // embedx feature create threshold [ default = 10 ]; // embedx feature create threshold
optional CtrAccessorParameter ctr_accessor_param = 7; optional CtrAccessorParameter ctr_accessor_param = 7;
repeated TableAccessorSaveParameter table_accessor_save_param = 8; repeated TableAccessorSaveParameter table_accessor_save_param = 8;
optional SGDParameter embed_sgd_param = 10;
optional SGDParameter embedx_sgd_param = 11;
} }
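// A minimal configuration sketch (textproto, illustrative values only; assumes
// SGDParameter exposes a name selector and an adam sub-message, as consumed by
// the SGD rules' LoadConfig):
//   accessor_class: "CtrDymfAccessor"
//   embedx_dim: 8
//   embedx_threshold: 10
//   embed_sgd_param { name: "SparseAdamSGDRule" adam { learning_rate: 0.001 } }
//   embedx_sgd_param { name: "SparseSharedAdamSGDRule" adam { learning_rate: 0.001 } }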
message SGDParameter { message SGDParameter {
...@@ -228,7 +228,7 @@ message ...@@ -228,7 +228,7 @@ message
repeated float weight_bounds = 4; repeated float weight_bounds = 4;
} }
message SparseAdamSGDParameter { // SparseAdamSGDRule message SparseAdamSGDParameter { // SparseAdamSGDRule | SparseSharedAdamSGDRule
optional double learning_rate = 1 [ default = 0.001 ]; optional double learning_rate = 1 [ default = 0.001 ];
optional double initial_range = 2 [ default = 0.0001 ]; optional double initial_range = 2 [ default = 0.0001 ];
optional double beta1_decay_rate = 3 [ default = 0.9 ]; optional double beta1_decay_rate = 3 [ default = 0.9 ];
......
...@@ -25,10 +25,17 @@ endif() ...@@ -25,10 +25,17 @@ endif()
if(WITH_HETERPS) if(WITH_HETERPS)
if(WITH_NCCL AND WITH_GPU) if(WITH_NCCL AND WITH_GPU)
nv_library( if(WITH_PSCORE)
ps_gpu_wrapper nv_library(
SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc ps_gpu_wrapper
DEPS heter_ps gloo_wrapper ${BRPC_DEPS}) SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
DEPS heter_ps gloo_wrapper ps_framework_proto ${BRPC_DEPS})
else()
nv_library(
ps_gpu_wrapper
SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
DEPS heter_ps gloo_wrapper ${BRPC_DEPS})
endif()
add_subdirectory(heter_ps) add_subdirectory(heter_ps)
elseif(WITH_XPU_KP) elseif(WITH_XPU_KP)
xpu_library( xpu_library(
......
...@@ -81,7 +81,6 @@ class HeterContext { ...@@ -81,7 +81,6 @@ class HeterContext {
std::vector<std::vector<FeatureValue>> device_values_; std::vector<std::vector<FeatureValue>> device_values_;
std::vector<std::vector<FeatureKey>> device_keys_; std::vector<std::vector<FeatureKey>> device_keys_;
std::vector<std::vector<std::vector<FeatureKey>>> device_dim_keys_; std::vector<std::vector<std::vector<FeatureKey>>> device_dim_keys_;
std::vector<std::vector<std::vector<FeatureValue>>> device_dim_values_;
std::vector<std::mutex*> mutex_; std::vector<std::mutex*> mutex_;
std::vector<std::vector<std::mutex*>> dim_mutex_; std::vector<std::vector<std::mutex*>> dim_mutex_;
int multi_mf_dim_ = 0; int multi_mf_dim_ = 0;
...@@ -114,7 +113,6 @@ class HeterContext { ...@@ -114,7 +113,6 @@ class HeterContext {
value_dim_ptr_[i].resize(dim_num); value_dim_ptr_[i].resize(dim_num);
} }
device_values_.resize(device_num); device_values_.resize(device_num);
device_dim_values_.resize(device_num);
device_keys_.resize(device_num); device_keys_.resize(device_num);
device_dim_keys_.resize(device_num); device_dim_keys_.resize(device_num);
......
...@@ -9,16 +9,16 @@ if(WITH_GPU) ...@@ -9,16 +9,16 @@ if(WITH_GPU)
endif() endif()
nv_library( nv_library(
heter_comm_kernel heter_comm_kernel
SRCS heter_comm_kernel.cu feature_value.h SRCS heter_comm_kernel.cu feature_value.h feature_value.cu
DEPS ${HETERPS_DEPS}) DEPS ${HETERPS_DEPS})
nv_library( nv_library(
hashtable_kernel hashtable_kernel
SRCS hashtable_kernel.cu feature_value.h SRCS hashtable_kernel.cu feature_value.h feature_value.cu
DEPS ${HETERPS_DEPS}) DEPS ${HETERPS_DEPS})
nv_library( nv_library(
heter_comm heter_comm
SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h SRCS heter_comm.h feature_value.h feature_value.cu heter_resource.cc
mem_pool.h heter_resource.h mem_pool.h
DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel) DEPS ${HETERPS_DEPS} heter_comm_kernel hashtable_kernel)
nv_test( nv_test(
test_heter_comm test_heter_comm
......
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_HETERPS
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
namespace paddle {
namespace framework {
template <typename FVAccessor>
__global__ void PullCopy(float** dest,
const float* src,
const int64_t* len,
int slot_num,
int total_len,
uint64_t** keys,
uint64_t max_val_size,
int* gpu_dim,
FVAccessor feature_value_accessor) {
CUDA_KERNEL_LOOP(i, total_len) {
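// binary search over the prefix-summed slot lengths `len` to find the slot
// index x that flattened element i belongs to; y is the offset within that slot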
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[x - 1] : 0);
float* feature_value_ptr =
(float*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
int mf_dim = gpu_dim[x] - 3;
feature_value_accessor.Select(
dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim);
}
}
template <typename FVAccessor>
__global__ void PushCopyWithPool(float* dest,
float** src,
int64_t* len,
int slot_num,
uint64_t total_len,
int bs,
int* slot_vector,
int* mf_dim_vector,
size_t grad_value_size,
FVAccessor feature_value_accessor) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[low - 1] : 0);
float* cur = (float*)((char*)dest + i * grad_value_size);
cur[feature_value_accessor.common_push_value.SlotIndex()] =
(float)slot_vector[x];
int mf_dim = mf_dim_vector[x];
cur[feature_value_accessor.common_push_value.MfDimIndex()] = mf_dim;
cur[feature_value_accessor.common_push_value.ShowIndex()] =
*(src[x] + y * (mf_dim + 3));
cur[feature_value_accessor.common_push_value.ClickIndex()] =
*(src[x] + y * (mf_dim + 3) + 1);
cur[feature_value_accessor.common_push_value.EmbedGIndex()] =
*(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
for (int j = 0; j < mf_dim; j++) {
cur[feature_value_accessor.common_push_value.EmbedxGIndex() + j] =
*(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
}
}
}
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPullImpl(
const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const float* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim,
int feature_value_size) {
auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
cudaMemcpy(gpu_values,
values.data(),
values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
gpu_values,
total_values_gpu,
gpu_len,
slot_num,
total_length,
gpu_keys,
feature_value_size,
gpu_dim,
gpu_accessor_);
cudaStreamSynchronize(stream);
}
template <typename GPUAccessor>
void AccessorWrapper<GPUAccessor>::CopyForPushImpl(
const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
float* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size,
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector) {
auto stream = dynamic_cast<paddle::platform::CUDADeviceContext*>(
paddle::platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
auto buf_grad_value =
memory::Alloc(place, grad_values.size() * sizeof(float*));
auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
auto buf_slot_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
auto buf_mf_dim_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
int* d_mf_dim_vector = reinterpret_cast<int*>(buf_mf_dim_vector->ptr());
cudaMemcpy(gpu_values,
grad_values.data(),
grad_values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
cudaMemcpy(d_slot_vector,
slot_vector.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(d_mf_dim_vector,
slot_mf_dim_vector.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
total_grad_values_gpu,
gpu_values,
gpu_len,
slot_lengths.size(),
total_length,
batch_size,
d_slot_vector,
d_mf_dim_vector,
grad_value_size,
gpu_accessor_);
cudaStreamSynchronize(stream);
}
#ifdef PADDLE_WITH_PSCORE
template class AccessorWrapper<CommonFeatureValueAccessor>;
#endif
} // namespace framework
} // namespace paddle
#endif
...@@ -17,12 +17,547 @@ limitations under the License. */ ...@@ -17,12 +17,547 @@ limitations under the License. */
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
#include <iostream> #include <iostream>
#include <sstream>
#include <unordered_map>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#define MF_DIM 8 #define MF_DIM 8
typedef uint64_t FeatureKey; typedef uint64_t FeatureKey;
#define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
class FeatureValueAccessor {
public:
__host__ __device__ FeatureValueAccessor() {}
__host__ __device__ ~FeatureValueAccessor() {}
__host__ __device__ virtual int Configure(
std::unordered_map<std::string, float> config) {
_config = config;
Initialize();
return 0;
}
__host__ __device__ virtual int Initialize() = 0;
protected:
std::unordered_map<std::string, float> _config;
};
// adagrad: embed_sgd_dim=1, embedx_sgd_dim=1,embedx_dim=n
// adam std: embed_sgd_dim=4, embedx_sgd_dim=n*2+2,embedx_dim=n
// adam shared: embed_sgd_dim=4, embedx_sgd_dim=4,embedx_dim=n
class CommonFeatureValueAccessor : public FeatureValueAccessor {
public:
struct CommonFeatureValue {
/*
uint64_t cpu_ptr;
float delta_score;
float show;
float click;
float embed_w;
std::vector<float> embed_g2sum;
float slot;
float mf_dim;
float mf_size;
std::vector<float> embedx_g2sum;
std::vector<float> embedx_w;
*/
__host__ __device__ int Dim() {
return 9 + embed_sgd_dim + embedx_sgd_dim + embedx_dim;
} // includes cpu_ptr (2 floats)
__host__ __device__ int DimSize(size_t dim, int embedx_dim) {
return sizeof(float);
}
__host__ __device__ size_t Size() {
return TYPEALIGN(8, Dim() * sizeof(float));
} // cpu_ptr is a uint64 and occupies 2 floats
__host__ __device__ int EmbedDim() { return embed_sgd_dim; }
__host__ __device__ int EmbedXDim() { return embedx_sgd_dim; }
__host__ __device__ int EmbedWDim() { return embedx_dim; }
__host__ __device__ int CpuPtrIndex() { return 0; } // cpu_ptr: uint64
__host__ __device__ int DeltaScoreIndex() { return CpuPtrIndex() + 2; }
__host__ __device__ int ShowIndex() { return DeltaScoreIndex() + 1; }
__host__ __device__ int ClickIndex() { return ShowIndex() + 1; }
__host__ __device__ int EmbedWIndex() { return ClickIndex() + 1; }
__host__ __device__ int EmbedG2SumIndex() { return EmbedWIndex() + 1; }
__host__ __device__ int SlotIndex() {
return EmbedG2SumIndex() + embed_sgd_dim;
}
__host__ __device__ int MfDimIndex() { return SlotIndex() + 1; }
__host__ __device__ int MfSizeIndex() {
return MfDimIndex() + 1;
} // actual mf size (0 when the feature has no mf part yet)
__host__ __device__ int EmbedxG2SumIndex() { return MfSizeIndex() + 1; }
__host__ __device__ int EmbedxWIndex() {
return EmbedxG2SumIndex() + embedx_sgd_dim;
}
// total value length (number of floats) computed from mf_dim
__host__ __device__ int Dim(int& mf_dim) {
int tmp_embedx_sgd_dim = 1;
if (optimizer_type_ == 3) { // adam
tmp_embedx_sgd_dim = mf_dim * 2 + 2;
} else if (optimizer_type_ == 4) { // shared_adam
tmp_embedx_sgd_dim = 4;
}
return 9 + embed_sgd_dim + tmp_embedx_sgd_dim + mf_dim;
}
// total size in bytes computed from mf_dim
__host__ __device__ size_t Size(int& mf_dim) {
return TYPEALIGN(8, Dim(mf_dim) * sizeof(float)); // cpu_ptr:2float
}
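// Illustrative example (assumed values): shared adam (optimizer_type_ == 4) with
// embed_sgd_dim = 4 and mf_dim = 8 gives Dim(mf_dim) = 9 + 4 + 4 + 8 = 25 floats,
// and Size(mf_dim) = TYPEALIGN(8, 25 * sizeof(float)) = 104 bytes (the leading
// two floats hold the 8-byte cpu_ptr).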
// mf_size in bytes computed from mf_dim
__host__ __device__ size_t MFSize(int& mf_dim) {
int tmp_embedx_sgd_dim = 1;
if (optimizer_type_ == 3) { // adam
tmp_embedx_sgd_dim = mf_dim * 2 + 2;
} else if (optimizer_type_ == 4) { // shared_adam
tmp_embedx_sgd_dim = 4;
}
return (tmp_embedx_sgd_dim + mf_dim) * sizeof(float);
}
__host__ __device__ int EmbedxG2SumOffsetIndex() { return 0; }
__host__ __device__ int EmbedxWOffsetIndex(float* val) {
// has mf
int tmp_embedx_sgd_dim = 1;
if (int(MfSize(val)) > 0) {
if (optimizer_type_ == 3) { // adam
tmp_embedx_sgd_dim = int(MfDim(val)) * 2 + 2;
} else if (optimizer_type_ == 4) { // shared_adam
tmp_embedx_sgd_dim = 4;
}
return EmbedxG2SumIndex() + tmp_embedx_sgd_dim;
} else {
// no mf
return 0;
}
}
__host__ __device__ uint64_t CpuPtr(float* val) {
return *(reinterpret_cast<uint64_t*>(val));
}
__host__ __device__ float& DeltaScore(float* val) {
return val[DeltaScoreIndex()];
}
__host__ __device__ float& Show(float* val) { return val[ShowIndex()]; }
__host__ __device__ float& Click(float* val) { return val[ClickIndex()]; }
__host__ __device__ float& Slot(float* val) { return val[SlotIndex()]; }
__host__ __device__ float& MfDim(float* val) { return val[MfDimIndex()]; }
__host__ __device__ float& MfSize(float* val) { return val[MfSizeIndex()]; }
__host__ __device__ float& EmbedW(float* val) { return val[EmbedWIndex()]; }
__host__ __device__ float& EmbedG2Sum(float* val) {
return val[EmbedG2SumIndex()];
}
__host__ __device__ float& EmbedxG2Sum(float* val) {
return val[EmbedxG2SumIndex()];
}
__host__ __device__ float& EmbedxW(float* val) {
return val[EmbedxWIndex()];
}
int embed_sgd_dim;
int embedx_dim;
int embedx_sgd_dim;
int optimizer_type_;
};
struct CommonPushValue {
/*
float slot;
float show;
float click;
float mf_dim;
float embed_g;
std::vector<float> embedx_g;
*/
__host__ __device__ int Dim(int embedx_dim) { return 5 + embedx_dim; }
__host__ __device__ int DimSize(int dim, int embedx_dim) {
return sizeof(float);
}
__host__ __device__ int Size(int embedx_dim) {
return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float));
}
__host__ __device__ int SlotIndex() { return 0; }
__host__ __device__ int ShowIndex() {
return CommonPushValue::SlotIndex() + 1;
}
__host__ __device__ int ClickIndex() {
return CommonPushValue::ShowIndex() + 1;
}
__host__ __device__ int MfDimIndex() {
return CommonPushValue::ClickIndex() + 1;
}
__host__ __device__ int EmbedGIndex() {
return CommonPushValue::MfDimIndex() + 1;
}
__host__ __device__ int EmbedxGIndex() {
return CommonPushValue::EmbedGIndex() + 1;
}
__host__ __device__ float& Slot(float* val) {
return val[CommonPushValue::SlotIndex()];
}
__host__ __device__ float& Show(float* val) {
return val[CommonPushValue::ShowIndex()];
}
__host__ __device__ float& Click(float* val) {
return val[CommonPushValue::ClickIndex()];
}
__host__ __device__ float& MfDim(float* val) {
return val[CommonPushValue::MfDimIndex()];
}
__host__ __device__ float& EmbedG(float* val) {
return val[CommonPushValue::EmbedGIndex()];
}
__host__ __device__ float* EmbedxG(float* val) {
return val + CommonPushValue::EmbedxGIndex();
}
};
struct CommonPullValue {
/*
float show;
float click;
float embed_w;
std::vector<float> embedx_w;
*/
__host__ __device__ static int Dim(int embedx_dim) {
return 3 + embedx_dim;
}
__host__ __device__ int DimSize(size_t dim) { return sizeof(float); }
__host__ __device__ int Size(int embedx_dim) {
return TYPEALIGN(8, Dim(embedx_dim) * sizeof(float));
}
__host__ __device__ int ShowIndex() { return 0; }
__host__ __device__ int ClickIndex() { return 1; }
__host__ __device__ int EmbedWIndex() { return 2; }
__host__ __device__ int EmbedxWIndex() { return 3; }
__host__ __device__ float& Show(float* val) {
return val[CommonPullValue::ShowIndex()];
}
__host__ __device__ float& Click(float* val) {
return val[CommonPullValue::ClickIndex()];
}
__host__ __device__ float& EmbedW(float* val) {
return val[CommonPullValue::EmbedWIndex()];
}
__host__ __device__ float* EmbedxW(float* val) {
return val + CommonPullValue::EmbedxWIndex();
}
};
__host__ __device__ CommonFeatureValueAccessor() {}
__host__ __device__ ~CommonFeatureValueAccessor() {}
__host__ __device__ virtual int Initialize() {
int optimizer_type = (_config.find("optimizer_type") == _config.end())
? 1
: int(_config["optimizer_type"]);
int sparse_embedx_dim = (_config.find("embedx_dim") == _config.end())
? 8
: int(_config["embedx_dim"]);
if (optimizer_type == 3) { // adam
common_feature_value.embed_sgd_dim = 4;
common_feature_value.embedx_sgd_dim = sparse_embedx_dim * 2 + 2;
} else if (optimizer_type == 4) { // shared_adam
common_feature_value.embed_sgd_dim = 4;
common_feature_value.embedx_sgd_dim = 4;
} else {
common_feature_value.embed_sgd_dim = 1;
common_feature_value.embedx_sgd_dim = 1;
}
common_feature_value.optimizer_type_ = optimizer_type;
common_feature_value.embedx_dim = sparse_embedx_dim;
return 0;
}
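// Usage sketch (hypothetical values): the host side hands the sparse-table
// config to the accessor as a flat string->float map, e.g.
//   std::unordered_map<std::string, float> cfg = {{"optimizer_type", 4},
//                                                 {"embedx_dim", 8}};
//   CommonFeatureValueAccessor accessor;
//   accessor.Configure(cfg);  // shared adam: embed_sgd_dim = embedx_sgd_dim = 4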
// copy cpu_val into gpu_val during the build stage
__host__ void BuildFill(
float* gpu_val,
void* cpu,
paddle::distributed::ValueAccessor* cpu_table_accessor,
int mf_dim) {
#ifdef PADDLE_WITH_PSCORE
paddle::distributed::CtrDymfAccessor* cpu_accessor =
dynamic_cast<paddle::distributed::CtrDymfAccessor*>(cpu_table_accessor);
paddle::distributed::FixedFeatureValue* cpu_ptr =
(paddle::distributed::FixedFeatureValue*)(cpu);
float* cpu_val = cpu_ptr->data();
size_t cpu_dim = cpu_ptr->size();
gpu_val[common_feature_value.DeltaScoreIndex()] =
cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()];
gpu_val[common_feature_value.ShowIndex()] =
cpu_val[cpu_accessor->common_feature_value.ShowIndex()];
gpu_val[common_feature_value.ClickIndex()] =
cpu_val[cpu_accessor->common_feature_value.ClickIndex()];
gpu_val[common_feature_value.SlotIndex()] =
cpu_val[cpu_accessor->common_feature_value.SlotIndex()];
gpu_val[common_feature_value.EmbedWIndex()] =
cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()];
for (int i = 0; i < common_feature_value.EmbedDim(); i++) {
gpu_val[common_feature_value.EmbedG2SumIndex() + i] =
cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i];
}
*(reinterpret_cast<uint64_t*>(
gpu_val + common_feature_value.CpuPtrIndex())) = (uint64_t)(cpu);
cpu_val[cpu_accessor->common_feature_value.MfDimIndex()] = float(mf_dim);
gpu_val[common_feature_value.MfDimIndex()] = mf_dim;
if (cpu_dim > cpu_accessor->GetAccessorInfo().dim -
cpu_accessor->GetAccessorInfo().mf_size / sizeof(float)) {
gpu_val[common_feature_value.MfSizeIndex()] =
common_feature_value.MFSize(mf_dim) / sizeof(float);
for (int x = 0;
x < int(common_feature_value.MFSize(mf_dim) / sizeof(float));
x++) {
gpu_val[common_feature_value.EmbedxG2SumIndex() + x] =
cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x];
}
} else {
gpu_val[common_feature_value.MfSizeIndex()] = 0;
for (int x = common_feature_value.EmbedxG2SumIndex();
x < int(common_feature_value.Size(mf_dim) / sizeof(float));
x++) {
gpu_val[x] = 0;
}
}
#endif
}
// copy gpu_val back into cpu_val during the dump_to_cpu stage
__host__ void DumpFill(float* gpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor,
int mf_dim) {
#ifdef PADDLE_WITH_PSCORE
paddle::distributed::CtrDymfAccessor* cpu_accessor =
dynamic_cast<paddle::distributed::CtrDymfAccessor*>(cpu_table_accessor);
auto* downpour_value =
(paddle::distributed::FixedFeatureValue*)(*(reinterpret_cast<uint64_t*>(
gpu_val + common_feature_value.CpuPtrIndex())));
size_t downpour_value_size = downpour_value->size();
if (gpu_val[common_feature_value.MfSizeIndex()] > 0 &&
downpour_value_size == (cpu_accessor->GetAccessorInfo().dim -
int(cpu_accessor->GetAccessorInfo().mf_size /
sizeof(float)))) { // cpu_accessor
downpour_value->resize(cpu_accessor->common_feature_value.Dim(mf_dim));
}
float* cpu_val = downpour_value->data();
cpu_val[cpu_accessor->common_feature_value.DeltaScoreIndex()] =
gpu_val[common_feature_value.DeltaScoreIndex()];
cpu_val[cpu_accessor->common_feature_value.ShowIndex()] =
gpu_val[common_feature_value.ShowIndex()];
cpu_val[cpu_accessor->common_feature_value.ClickIndex()] =
gpu_val[common_feature_value.ClickIndex()];
cpu_val[cpu_accessor->common_feature_value.EmbedWIndex()] =
gpu_val[common_feature_value.EmbedWIndex()];
cpu_val[cpu_accessor->common_feature_value.SlotIndex()] =
gpu_val[common_feature_value.SlotIndex()];
for (int i = 0; i < common_feature_value.EmbedDim(); i++) {
cpu_val[cpu_accessor->common_feature_value.EmbedG2SumIndex() + i] =
gpu_val[common_feature_value.EmbedG2SumIndex() + i];
}
if (gpu_val[common_feature_value.MfSizeIndex()] > 0) {
for (int x = 0;
x < int(common_feature_value.MFSize(mf_dim) / sizeof(float));
x++) {
cpu_val[cpu_accessor->common_feature_value.EmbedxG2SumIndex() + x] =
gpu_val[common_feature_value.EmbedxG2SumIndex() + x];
}
}
#endif
}
// copy src_val into dest_val inside the dy_mf_fill_dvals_kernel and
// dy_mf_search_kernel GPU kernels
__host__ __device__ void FeatureValueFill(float* dest_val,
float* src_val,
int mf_dim) {
*(reinterpret_cast<uint64_t*>(dest_val +
common_feature_value.CpuPtrIndex())) =
*(reinterpret_cast<uint64_t*>(src_val +
common_feature_value.CpuPtrIndex()));
dest_val[common_feature_value.DeltaScoreIndex()] =
src_val[common_feature_value.DeltaScoreIndex()];
dest_val[common_feature_value.ShowIndex()] =
src_val[common_feature_value.ShowIndex()];
dest_val[common_feature_value.ClickIndex()] =
src_val[common_feature_value.ClickIndex()];
dest_val[common_feature_value.EmbedWIndex()] =
src_val[common_feature_value.EmbedWIndex()];
for (int i = 0; i < common_feature_value.EmbedDim(); i++) {
dest_val[common_feature_value.EmbedG2SumIndex() + i] =
src_val[common_feature_value.EmbedG2SumIndex() + i];
}
dest_val[common_feature_value.SlotIndex()] =
src_val[common_feature_value.SlotIndex()];
dest_val[common_feature_value.MfDimIndex()] = mf_dim;
dest_val[common_feature_value.MfSizeIndex()] =
src_val[common_feature_value.MfSizeIndex()];
for (int x = common_feature_value.EmbedxG2SumIndex();
x < int(common_feature_value.Size(mf_dim) / sizeof(float));
x++) {
dest_val[x] = src_val[x];
}
}
// copy src_val into dest_val inside the dy_mf_fill_shard_grads_kernel and
// update_one GPU kernels
__host__ __device__ void PushValueFill(float* dest_val,
const float* src_val) {
dest_val[common_push_value.SlotIndex()] =
src_val[common_push_value.SlotIndex()];
dest_val[common_push_value.ShowIndex()] =
src_val[common_push_value.ShowIndex()];
dest_val[common_push_value.ClickIndex()] =
src_val[common_push_value.ClickIndex()];
dest_val[common_push_value.MfDimIndex()] =
src_val[common_push_value.MfDimIndex()];
dest_val[common_push_value.EmbedGIndex()] =
src_val[common_push_value.EmbedGIndex()];
for (int x = 0; x < int(src_val[common_push_value.MfDimIndex()]); x++) {
dest_val[common_push_value.EmbedxGIndex() + x] =
src_val[common_push_value.EmbedxGIndex() + x];
}
}
// copy src_val into dest_val inside the update_basic GPU kernel
__host__ __device__ void PushValueFillBasic(float* dest_val,
const float* src_val) {
dest_val[common_push_value.SlotIndex()] =
src_val[common_push_value.SlotIndex()];
dest_val[common_push_value.ShowIndex()] =
src_val[common_push_value.ShowIndex()];
dest_val[common_push_value.ClickIndex()] =
src_val[common_push_value.ClickIndex()];
dest_val[common_push_value.MfDimIndex()] =
src_val[common_push_value.MfDimIndex()];
dest_val[common_push_value.EmbedGIndex()] =
src_val[common_push_value.EmbedGIndex()];
}
// merge the PushValue src_val into dest_val inside the merge_one GPU kernel
__host__ __device__ void MergePushValue(float* dest_val,
const float* src_val) {
dest_val[common_push_value.ShowIndex()] +=
src_val[common_push_value.ShowIndex()];
dest_val[common_push_value.ClickIndex()] +=
src_val[common_push_value.ClickIndex()];
dest_val[common_push_value.EmbedGIndex()] +=
src_val[common_push_value.EmbedGIndex()];
for (int j = 0; j < int(dest_val[common_push_value.MfDimIndex()]); j++) {
dest_val[common_push_value.EmbedxGIndex() + j] +=
src_val[common_push_value.EmbedxGIndex() + j];
}
}
// merge the PushValue src_val into dest_val inside the merge_basic GPU kernel
__host__ __device__ void MergePushValueBasic(float* dest_val,
const float* src_val) {
dest_val[common_push_value.ShowIndex()] +=
src_val[common_push_value.ShowIndex()];
dest_val[common_push_value.ClickIndex()] +=
src_val[common_push_value.ClickIndex()];
dest_val[common_push_value.EmbedGIndex()] +=
src_val[common_push_value.EmbedGIndex()];
}
// copy the FeatureValue back into the PullValue inside the PullCopy GPU kernel
__host__ __device__ void Select(float* dest_val,
float* src_val,
uint64_t* key,
int mf_dim) {
if (*key == 0) {
*(dest_val + common_pull_value.ShowIndex()) = 0;
*(dest_val + common_pull_value.ClickIndex()) = 0;
*(dest_val + common_pull_value.EmbedWIndex()) = 0;
} else {
*(dest_val + common_pull_value.ShowIndex()) =
src_val[common_feature_value.ShowIndex()];
*(dest_val + common_pull_value.ClickIndex()) =
src_val[common_feature_value.ClickIndex()];
*(dest_val + common_pull_value.EmbedWIndex()) =
src_val[common_feature_value.EmbedWIndex()];
}
if (src_val[common_feature_value.MfSizeIndex()] == 0 || *key == 0) {
for (int j = 0; j < mf_dim; j++) {
*(dest_val + common_pull_value.EmbedxWIndex() + j) = 0;
}
} else {
for (int j = 0; j < mf_dim; j++) {
*(dest_val + common_pull_value.EmbedxWIndex() + j) =
src_val[common_feature_value.EmbedxWOffsetIndex(src_val) + j];
}
}
}
__host__ __device__ std::string ParseToString(const float* v,
int param_size) {
/*
uint64_t cpu_ptr; // 2float
float delta_score;
float show;
float click;
float embed_w;
std::vector<float> embed_g2sum;
float slot;
float mf_dim;
float mf_size;
std::vector<float> embedx_g2sum;
std::vector<float> embedx_w;
*/
std::stringstream os;
os << "cpuptr: " << common_feature_value.CpuPtr(const_cast<float*>(v))
<< " delta_score: " << v[2] << " show: " << v[3] << " click: " << v[4]
<< " embed_w:" << v[5] << " embed_g2sum:";
for (int i = common_feature_value.EmbedG2SumIndex();
i < common_feature_value.SlotIndex();
i++) {
os << " " << v[i];
}
int mf_dim = int(common_feature_value.MfDim(const_cast<float*>(v)));
os << " slot: " << common_feature_value.Slot(const_cast<float*>(v))
<< " mf_dim: " << mf_dim
<< " mf_size: " << common_feature_value.MfSize(const_cast<float*>(v))
<< " mf: ";
if (param_size > common_feature_value.EmbedxG2SumIndex()) {
for (auto i = common_feature_value.EmbedxG2SumIndex();
i < common_feature_value.Dim(mf_dim);
++i) {
os << " " << v[i];
}
}
return os.str();
}
public:
CommonFeatureValue common_feature_value;
CommonPushValue common_push_value;
CommonPullValue common_pull_value;
};
struct FeatureValue { struct FeatureValue {
float delta_score; float delta_score;
...@@ -95,6 +630,176 @@ struct FeaturePushValue { ...@@ -95,6 +630,176 @@ struct FeaturePushValue {
} }
}; };
class VirtualAccessor {
public:
virtual int Configure(std::unordered_map<std::string, float> config) = 0;
virtual size_t GetFeatureValueSize(int& mf_dim) = 0;
virtual size_t GetPushValueSize(int& mf_dim) = 0;
virtual void BuildFill(void* gpu_val,
void* cpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor,
int mf_dim) = 0;
virtual void DumpFill(float* gpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor,
int mf_dim) = 0;
virtual void CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const float* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim,
int feature_value_size) = 0;
virtual void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
float* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size,
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector) = 0;
virtual std::string ParseToString(const float* v, int param_size) = 0;
};
template <typename GPUAccessor>
class AccessorWrapper : public VirtualAccessor {
public:
explicit AccessorWrapper() {}
virtual ~AccessorWrapper() {}
AccessorWrapper(const AccessorWrapper&) = delete;
AccessorWrapper& operator=(const AccessorWrapper&) = delete;
virtual int Configure(std::unordered_map<std::string, float> config) {
return gpu_accessor_.Configure(config);
}
virtual size_t GetFeatureValueSize(int& mf_dim) {
return gpu_accessor_.common_feature_value.Size(mf_dim);
}
virtual size_t GetPushValueSize(int& mf_dim) {
return gpu_accessor_.common_push_value.Size(mf_dim);
}
virtual void BuildFill(void* gpu_val,
void* cpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor,
int mf_dim) {
gpu_accessor_.BuildFill(
(float*)(gpu_val), cpu_val, cpu_table_accessor, mf_dim);
}
virtual void DumpFill(float* gpu_val,
paddle::distributed::ValueAccessor* cpu_table_accessor,
int mf_dim) {
gpu_accessor_.DumpFill(gpu_val, cpu_table_accessor, mf_dim);
}
virtual void CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const float* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim,
int feature_value_size) {
CopyForPullImpl(place,
gpu_keys,
values,
total_values_gpu,
gpu_len,
slot_num,
hidden_size,
total_length,
gpu_dim,
feature_value_size);
}
virtual void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
float* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size,
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector) {
CopyForPushImpl(place,
grad_values,
total_grad_values_gpu,
slot_lengths,
total_length,
batch_size,
grad_value_size,
slot_vector,
slot_mf_dim_vector);
}
void CopyForPullImpl(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const float* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim,
int feature_value_size);
void CopyForPushImpl(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
float* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size,
std::vector<int>& slot_vector,
std::vector<int>& slot_mf_dim_vector);
virtual std::string ParseToString(const float* v, int param_size) {
return gpu_accessor_.ParseToString(v, param_size);
}
GPUAccessor gpu_accessor_;
};
class GlobalAccessorTransfor {
public:
static GlobalAccessorTransfor& GetInstance() {
static GlobalAccessorTransfor ins;
return ins;
}
void Init(std::string accessor_type) {
if (accessor_wrapper_ptr_ != nullptr) {
return;
}
if (accessor_type == "CtrDymfAccessor") {
accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
} else {
VLOG(0) << "GlobalAccessorTransfor Init not support accessor_type:"
<< accessor_type;
accessor_wrapper_ptr_ = new AccessorWrapper<CommonFeatureValueAccessor>();
}
}
VirtualAccessor* GetAccessorWrapper() { return accessor_wrapper_ptr_; }
private:
VirtualAccessor* accessor_wrapper_ptr_ = nullptr;
};
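// Usage sketch (illustrative): the PS-GPU wrapper is expected to select the GPU
// accessor from the CPU-side accessor class name and then query value sizes
// through the type-erased wrapper, roughly:
//   GlobalAccessorTransfor::GetInstance().Init("CtrDymfAccessor");
//   VirtualAccessor* wrapper =
//       GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
//   int mf_dim = 8;  // hypothetical embedding dim
//   size_t value_bytes = wrapper->GetFeatureValueSize(mf_dim);
//   size_t push_bytes = wrapper->GetPushValueSize(mf_dim);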
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
#endif #endif
...@@ -25,10 +25,12 @@ ...@@ -25,10 +25,12 @@
#ifdef PADDLE_WITH_HETERPS #ifdef PADDLE_WITH_HETERPS
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class GpuPsGraphTable : public HeterComm<uint64_t, int64_t, int> { class GpuPsGraphTable
: public HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor> {
public: public:
GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource, int topo_aware) GpuPsGraphTable(std::shared_ptr<HeterPsResource> resource, int topo_aware)
: HeterComm<uint64_t, int64_t, int>(1, resource) { : HeterComm<uint64_t, int64_t, int, CommonFeatureValueAccessor>(
1, resource) {
load_factor_ = 0.25; load_factor_ = 0.25;
rw_lock.reset(new pthread_rwlock_t()); rw_lock.reset(new pthread_rwlock_t());
gpu_num = resource_->total_device(); gpu_num = resource_->total_device();
......
...@@ -137,8 +137,12 @@ class HashTable { ...@@ -137,8 +137,12 @@ class HashTable {
size_t len, size_t len,
StreamType stream); StreamType stream);
template <typename StreamType> template <typename StreamType, typename FVAccessor>
void get(const KeyType* d_keys, char* d_vals, size_t len, StreamType stream); void get(const KeyType* d_keys,
char* d_vals,
size_t len,
StreamType stream,
FVAccessor& fv_accessor);
void show(); void show();
...@@ -150,9 +154,9 @@ class HashTable { ...@@ -150,9 +154,9 @@ class HashTable {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
template <typename GradType, typename Sgd, typename StreamType> template <typename Sgd, typename StreamType>
void update(const KeyType* d_keys, void update(const KeyType* d_keys,
const GradType* d_grads, const float* d_grads,
size_t len, size_t len,
Sgd sgd, Sgd sgd,
StreamType stream); StreamType stream);
......
...@@ -83,36 +83,25 @@ __global__ void search_kernel(Table* table, ...@@ -83,36 +83,25 @@ __global__ void search_kernel(Table* table,
} }
} }
template <typename Table> template <typename Table, typename FVAccessor>
__global__ void dy_mf_search_kernel(Table* table, __global__ void dy_mf_search_kernel(Table* table,
const typename Table::key_type* const keys, const typename Table::key_type* const keys,
char* vals, char* vals,
size_t len, size_t len,
size_t pull_feature_value_size) { size_t pull_feature_value_size,
FVAccessor feature_value_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) { if (i < len) {
auto it = table->find(keys[i]); auto it = table->find(keys[i]);
if (it != table->end()) { if (it != table->end()) {
uint64_t offset = i * pull_feature_value_size; uint64_t offset = i * pull_feature_value_size;
FeatureValue* cur = (FeatureValue*)(vals + offset); float* cur = (float*)(vals + offset);
FeatureValue& input = *(FeatureValue*)(it->second); float* input = it->second;
cur->slot = input.slot; int mf_dim =
cur->show = input.show; int(input[feature_value_accessor.common_feature_value.MfDimIndex()]);
cur->clk = input.clk;
cur->mf_dim = input.mf_dim; feature_value_accessor.FeatureValueFill(cur, input, mf_dim);
cur->lr = input.lr;
cur->mf_size = input.mf_size;
cur->cpu_ptr = input.cpu_ptr;
cur->delta_score = input.delta_score;
cur->lr_g2sum = input.lr_g2sum;
for (int j = 0; j < cur->mf_dim + 1; ++j) {
cur->mf[j] = input.mf[j];
}
} else {
if (keys[i] != 0) {
printf("warning::pull miss key: %llu", keys[i]);
}
} }
} }
} }
...@@ -145,8 +134,8 @@ __global__ void dy_mf_update_kernel(Table* table, ...@@ -145,8 +134,8 @@ __global__ void dy_mf_update_kernel(Table* table,
if (i < len) { if (i < len) {
auto it = table->find(keys[i]); auto it = table->find(keys[i]);
if (it != table->end()) { if (it != table->end()) {
FeaturePushValue* cur = (FeaturePushValue*)(grads + i * grad_value_size); float* cur = (float*)(grads + i * grad_value_size);
sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, *cur); sgd.dy_mf_update_value(optimizer_config, (it.getter())->second, cur);
} else { } else {
if (keys[i] != 0) { if (keys[i] != 0) {
printf("warning::push miss key: %llu", keys[i]); printf("warning::push miss key: %llu", keys[i]);
...@@ -212,17 +201,18 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ...@@ -212,17 +201,18 @@ void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
} }
template <typename KeyType, typename ValType> template <typename KeyType, typename ValType>
template <typename StreamType> template <typename StreamType, typename FVAccessor>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, void HashTable<KeyType, ValType>::get(const KeyType* d_keys,
char* d_vals, char* d_vals,
size_t len, size_t len,
StreamType stream) { StreamType stream,
FVAccessor& fv_accessor) {
if (len == 0) { if (len == 0) {
return; return;
} }
const int grid_size = (len - 1) / BLOCK_SIZE_ + 1; const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
dy_mf_search_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>( dy_mf_search_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(
container_, d_keys, d_vals, len, pull_feature_value_size_); container_, d_keys, d_vals, len, pull_feature_value_size_, fv_accessor);
} }
template <typename KeyType, typename ValType> template <typename KeyType, typename ValType>
...@@ -298,27 +288,6 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) { ...@@ -298,27 +288,6 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
cpu_val[x + 7] = gpu_val.mf[x]; cpu_val[x + 7] = gpu_val.mf[x];
} }
} }
#endif
#ifdef PADDLE_WITH_PSCORE
auto* downpour_value =
(paddle::distributed::FixedFeatureValue*)(gpu_val.cpu_ptr);
int downpour_value_size = downpour_value->size();
if (gpu_val.mf_size > 0 && downpour_value_size == 7) {
downpour_value->resize(gpu_val.mf_size + downpour_value_size);
}
float* cpu_val = downpour_value->data();
// cpu_val[0] = 0;
cpu_val[2] = gpu_val.delta_score;
cpu_val[3] = gpu_val.show;
cpu_val[4] = gpu_val.clk;
cpu_val[5] = gpu_val.lr;
cpu_val[6] = gpu_val.lr_g2sum;
cpu_val[0] = gpu_val.slot;
if (gpu_val.mf_size > 0) {
for (int x = 0; x < gpu_val.mf_size; x++) {
cpu_val[x + 7] = gpu_val.mf[x];
}
}
#endif #endif
} }
}; };
...@@ -336,9 +305,9 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) { ...@@ -336,9 +305,9 @@ void HashTable<KeyType, ValType>::dump_to_cpu(int devid, StreamType stream) {
} }
template <typename KeyType, typename ValType> template <typename KeyType, typename ValType>
template <typename GradType, typename Sgd, typename StreamType> template <typename Sgd, typename StreamType>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys, void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
const GradType* d_grads, const float* d_grads,
size_t len, size_t len,
Sgd sgd, Sgd sgd,
StreamType stream) { StreamType stream) {
...@@ -371,8 +340,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys, ...@@ -371,8 +340,8 @@ void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
push_grad_value_size_); push_grad_value_size_);
} }
template class HashTable<unsigned long, paddle::framework::FeatureValue>; template class HashTable<unsigned long, float>;
template class HashTable<unsigned long, paddle::framework::FeatureValue*>; template class HashTable<unsigned long, float*>;
template class HashTable<long, int>; template class HashTable<long, int>;
template class HashTable<unsigned long, int>; template class HashTable<unsigned long, int>;
template class HashTable<unsigned long, unsigned long>; template class HashTable<unsigned long, unsigned long>;
...@@ -382,15 +351,19 @@ template class HashTable<long, long>; ...@@ -382,15 +351,19 @@ template class HashTable<long, long>;
template class HashTable<long, unsigned long>; template class HashTable<long, unsigned long>;
template class HashTable<long, unsigned int>; template class HashTable<long, unsigned int>;
template void HashTable<unsigned long, paddle::framework::FeatureValue>::get< template void HashTable<unsigned long, float>::get<cudaStream_t>(
cudaStream_t>(const unsigned long* d_keys, const unsigned long* d_keys,
paddle::framework::FeatureValue* d_vals, float* d_vals,
size_t len, size_t len,
cudaStream_t stream); cudaStream_t stream);
template void template void
HashTable<unsigned long, paddle::framework::FeatureValue*>::get<cudaStream_t>( HashTable<unsigned long, float*>::get<cudaStream_t, CommonFeatureValueAccessor>(
const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t stream); const unsigned long* d_keys,
char* d_vals,
size_t len,
cudaStream_t stream,
CommonFeatureValueAccessor& fv_accessor);
template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys, template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
int* d_vals, int* d_vals,
...@@ -399,6 +372,12 @@ template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys, ...@@ -399,6 +372,12 @@ template void HashTable<long, int>::get<cudaStream_t>(const long* d_keys,
template void HashTable<unsigned long, int>::get<cudaStream_t>( template void HashTable<unsigned long, int>::get<cudaStream_t>(
const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream); const unsigned long* d_keys, int* d_vals, size_t len, cudaStream_t stream);
template void HashTable<unsigned long, unsigned long>::get<cudaStream_t>(
const unsigned long* d_keys,
unsigned long* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<long, unsigned long>::get<cudaStream_t>( template void HashTable<long, unsigned long>::get<cudaStream_t>(
const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream); const long* d_keys, unsigned long* d_vals, size_t len, cudaStream_t stream);
template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys, template void HashTable<long, long>::get<cudaStream_t>(const long* d_keys,
...@@ -414,19 +393,19 @@ template void HashTable<unsigned long, long>::get<cudaStream_t>( ...@@ -414,19 +393,19 @@ template void HashTable<unsigned long, long>::get<cudaStream_t>(
// const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t // const unsigned long* d_keys, char* d_vals, size_t len, cudaStream_t
// stream); // stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::insert< template void HashTable<unsigned long, float>::insert<cudaStream_t>(
cudaStream_t>(const unsigned long* d_keys, const unsigned long* d_keys,
const paddle::framework::FeatureValue* d_vals, const float* d_vals,
size_t len, size_t len,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue*>:: template void HashTable<unsigned long, float*>::insert<cudaStream_t>(
insert<cudaStream_t>(const unsigned long* d_keys, const unsigned long* d_keys,
size_t len, size_t len,
char* pool, char* pool,
size_t feature_value_size, size_t feature_value_size,
size_t start_index, size_t start_index,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<long, int>::insert<cudaStream_t>(const long* d_keys, template void HashTable<long, int>::insert<cudaStream_t>(const long* d_keys,
const int* d_vals, const int* d_vals,
...@@ -460,30 +439,37 @@ template void HashTable<unsigned long, long>::insert<cudaStream_t>( ...@@ -460,30 +439,37 @@ template void HashTable<unsigned long, long>::insert<cudaStream_t>(
size_t len, size_t len,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>:: template void HashTable<unsigned long, unsigned long>::insert<cudaStream_t>(
dump_to_cpu<cudaStream_t>(int devid, cudaStream_t stream); const unsigned long* d_keys,
const unsigned long* d_vals,
size_t len,
cudaStream_t stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue>::update< template void HashTable<unsigned long, float*>::dump_to_cpu<cudaStream_t>(
paddle::framework::FeaturePushValue, int devid, cudaStream_t stream);
Optimizer<paddle::framework::FeatureValue,
paddle::framework::FeaturePushValue>, template void
HashTable<unsigned long, float*>::update<SparseAdagradOptimizer, cudaStream_t>(
const unsigned long* d_keys,
const char* d_grads,
size_t len,
SparseAdagradOptimizer sgd,
cudaStream_t stream);
template void
HashTable<unsigned long, float*>::update<SparseAdamOptimizer, cudaStream_t>(
const unsigned long* d_keys,
const char* d_grads,
size_t len,
SparseAdamOptimizer sgd,
cudaStream_t stream);
template void HashTable<unsigned long, float*>::update<
SparseAdamSharedOptimizer,
cudaStream_t>(const unsigned long* d_keys, cudaStream_t>(const unsigned long* d_keys,
const paddle::framework::FeaturePushValue* d_grads, const char* d_grads,
size_t len, size_t len,
Optimizer<paddle::framework::FeatureValue, SparseAdamSharedOptimizer sgd,
paddle::framework::FeaturePushValue> sgd,
cudaStream_t stream); cudaStream_t stream);
template void HashTable<unsigned long, paddle::framework::FeatureValue*>::
update<Optimizer<paddle::framework::FeatureValue,
paddle::framework::FeaturePushValue>,
cudaStream_t>(const unsigned long* d_keys,
const char* d_grads,
size_t len,
Optimizer<paddle::framework::FeatureValue,
paddle::framework::FeaturePushValue> sgd,
cudaStream_t stream);
// template void HashTable<unsigned long, // template void HashTable<unsigned long,
// paddle::framework::FeatureValue>::update< // paddle::framework::FeatureValue>::update<
// Optimizer<paddle::framework::FeatureValue, // Optimizer<paddle::framework::FeatureValue,
......
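The update instantiations above are needed because the template bodies are compiled only in this .cu file, so every SGD functor the runtime may select has to be stamped out explicitly. A hedged sketch of the corresponding host-side dispatch follows; optimizer_type == 3 meaning adam comes from this diff, while mapping 1 to adagrad and 4 to shared adam is inferred from the PR description and should be treated as an assumption.

#include <cstdio>
#include <initializer_list>

// Mock stand-ins whose names mirror the explicit instantiations above.
struct SparseAdagradOptimizerSketch    { const char* name() const { return "adagrad"; } };
struct SparseAdamOptimizerSketch       { const char* name() const { return "adam"; } };
struct SparseAdamSharedOptimizerSketch { const char* name() const { return "shared adam"; } };

struct PtrTableSketch {
  template <typename Sgd>
  void update(Sgd sgd) { std::printf("update with %s\n", sgd.name()); }
};

// Assumed optimizer_type mapping: 3 -> adam appears in this diff; 1 -> adagrad
// and 4 -> shared adam are inferences from the PR description.
void push_with(PtrTableSketch& table, int optimizer_type) {
  if (optimizer_type == 3) {
    table.update(SparseAdamOptimizerSketch{});
  } else if (optimizer_type == 4) {
    table.update(SparseAdamSharedOptimizerSketch{});
  } else {  // optimizer_type == 1
    table.update(SparseAdagradOptimizerSketch{});
  }
}

int main() {
  PtrTableSketch table;
  for (int type : {1, 3, 4}) push_with(table, type);
  return 0;
}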
...@@ -46,7 +46,10 @@ namespace framework { ...@@ -46,7 +46,10 @@ namespace framework {
#define TYPEALIGN(ALIGNVAL, LEN) \ #define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1))) (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
class HeterComm { class HeterComm {
public: public:
HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource); HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource);
...@@ -65,12 +68,9 @@ class HeterComm { ...@@ -65,12 +68,9 @@ class HeterComm {
GradType* d_grads, GradType* d_grads,
size_t len, size_t len,
int& uniq_len); // NOLINT int& uniq_len); // NOLINT
void dynamic_merge_grad(int gpu_num, void dynamic_merge_grad(
KeyType* d_keys, int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len);
GradType* d_grads, void pull_sparse(int num, KeyType* d_keys, float* d_vals, size_t len);
size_t len,
int& uniq_len);
void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len);
void build_ps(int num, void build_ps(int num,
KeyType* h_keys, KeyType* h_keys,
ValType* h_vals, ValType* h_vals,
...@@ -92,7 +92,7 @@ class HeterComm { ...@@ -92,7 +92,7 @@ class HeterComm {
template <typename Sgd> template <typename Sgd>
void push_sparse(int num, void push_sparse(int num,
KeyType* d_keys, KeyType* d_keys,
GradType* d_grads, float* d_grads,
size_t len, size_t len,
Sgd& sgd); // NOLINT Sgd& sgd); // NOLINT
#elif defined(PADDLE_WITH_XPU_KP) #elif defined(PADDLE_WITH_XPU_KP)
...@@ -149,6 +149,13 @@ class HeterComm { ...@@ -149,6 +149,13 @@ class HeterComm {
multi_mf_dim_ = multi_mf_dim; multi_mf_dim_ = multi_mf_dim;
max_mf_dim_ = max_mf_dim; max_mf_dim_ = max_mf_dim;
} }
void set_accessor(FVAccessor& accessor) {
feature_value_accessor_ = accessor;
// for (auto& ptr_table: ptr_tables_) {
// ptr_table->set_accessor(feature_value_accessor_);
// }
}
#endif #endif
bool need_transfer(int send_id, int receive_id) { bool need_transfer(int send_id, int receive_id) {
...@@ -282,9 +289,11 @@ class HeterComm { ...@@ -282,9 +289,11 @@ class HeterComm {
char* src_val, char* src_val,
size_t val_size); size_t val_size);
FVAccessor feature_value_accessor_;
protected: protected:
using Table = HashTable<KeyType, ValType>; using Table = HashTable<KeyType, ValType>;
using PtrTable = HashTable<KeyType, ValType*>; using PtrTable = HashTable<KeyType, float*>;
std::vector<Table*> tables_; std::vector<Table*> tables_;
std::vector<PtrTable*> ptr_tables_; std::vector<PtrTable*> ptr_tables_;
std::shared_ptr<HeterPsResource> resource_; std::shared_ptr<HeterPsResource> resource_;
......
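With the fourth FVAccessor template parameter and the feature_value_accessor_ member, the accessor travels with the comm object rather than being re-derived from hard-coded FeatureValue structs. The mock below only illustrates that ownership pattern; ToyAccessorSketch, HeterCommSketch, and FeatureValueSize are invented names.

#include <cstddef>
#include <cstdio>

// Toy accessor: only demonstrates why the comm owns one by value.
struct ToyAccessorSketch {
  int mf_dim = 8;
  size_t FeatureValueSize() const { return (4 + mf_dim) * sizeof(float); }
};

template <typename KeyType, typename ValType, typename GradType, typename FVAccessor>
class HeterCommSketch {
 public:
  void set_accessor(FVAccessor& accessor) { feature_value_accessor_ = accessor; }
  void pull_sparse(size_t len) const {
    // every kernel launch can read sizes/offsets from the stored accessor
    std::printf("pull %zu keys, row bytes = %zu\n",
                len, feature_value_accessor_.FeatureValueSize());
  }

 private:
  FVAccessor feature_value_accessor_;
};

int main() {
  ToyAccessorSketch accessor;
  HeterCommSketch<unsigned long, float*, float*, ToyAccessorSketch> comm;
  comm.set_accessor(accessor);
  comm.pull_sparse(1024);
  return 0;
}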
...@@ -24,8 +24,12 @@ limitations under the License. */ ...@@ -24,8 +24,12 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename KeyType, typename ValType, typename GradType>
HeterComm<KeyType, ValType, GradType>::HeterComm( template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
HeterComm<KeyType, ValType, GradType, FVAccessor>::HeterComm(
size_t capacity, std::shared_ptr<HeterPsResource> resource) { size_t capacity, std::shared_ptr<HeterPsResource> resource) {
VLOG(1) << "Construct new HeterComm"; VLOG(1) << "Construct new HeterComm";
resource_ = resource; resource_ = resource;
...@@ -42,10 +46,14 @@ HeterComm<KeyType, ValType, GradType>::HeterComm( ...@@ -42,10 +46,14 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
tables_.push_back(table); tables_.push_back(table);
} else { } else {
max_mf_dim_ = resource_->max_mf_dim(); max_mf_dim_ = resource_->max_mf_dim();
size_t val_type_size = TYPEALIGN( auto accessor_wrapper_ptr =
8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
size_t grad_type_size = TYPEALIGN( size_t val_type_size =
8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
size_t grad_type_size =
accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
VLOG(0) << " HeterComm init, max feature_value_size:" << val_type_size
<< ", feature_value_push_size:" << grad_type_size;
auto ptr_table = new PtrTable(capacity / load_factor_); auto ptr_table = new PtrTable(capacity / load_factor_);
ptr_table->set_feature_value_size(val_type_size, grad_type_size); ptr_table->set_feature_value_size(val_type_size, grad_type_size);
ptr_tables_.push_back(ptr_table); ptr_tables_.push_back(ptr_table);
...@@ -58,8 +66,11 @@ HeterComm<KeyType, ValType, GradType>::HeterComm( ...@@ -58,8 +66,11 @@ HeterComm<KeyType, ValType, GradType>::HeterComm(
init_path(); init_path();
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::init_path() { typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::init_path() {
int total_device = resource_->total_device(); int total_device = resource_->total_device();
path_.resize(total_device); path_.resize(total_device);
if (!topo_aware_) { if (!topo_aware_) {
...@@ -111,14 +122,18 @@ void HeterComm<KeyType, ValType, GradType>::init_path() { ...@@ -111,14 +122,18 @@ void HeterComm<KeyType, ValType, GradType>::init_path() {
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
template <typename DstPlace, typename SrcPlace, typename StreamType> template <typename DstPlace, typename SrcPlace, typename StreamType>
void HeterComm<KeyType, ValType, GradType>::memory_copy(DstPlace dst_place, void HeterComm<KeyType, ValType, GradType, FVAccessor>::memory_copy(
void* dst, DstPlace dst_place,
SrcPlace src_place, void* dst,
const void* src, SrcPlace src_place,
size_t count, const void* src,
StreamType stream) { size_t count,
StreamType stream) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream); cudaMemcpyAsync(dst, src, count, cudaMemcpyDefault, stream);
if (stream == 0) { if (stream == 0) {
...@@ -129,11 +144,12 @@ void HeterComm<KeyType, ValType, GradType>::memory_copy(DstPlace dst_place, ...@@ -129,11 +144,12 @@ void HeterComm<KeyType, ValType, GradType>::memory_copy(DstPlace dst_place,
#endif #endif
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::create_storage(int start_index, typename ValType,
int end_index, typename GradType,
int keylen, typename FVAccessor>
int vallen) { void HeterComm<KeyType, ValType, GradType, FVAccessor>::create_storage(
int start_index, int end_index, int keylen, int vallen) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
auto& allocator = allocators_[start_index]; auto& allocator = allocators_[start_index];
auto& nodes = path_[start_index][end_index].nodes_; auto& nodes = path_[start_index][end_index].nodes_;
...@@ -167,9 +183,12 @@ void HeterComm<KeyType, ValType, GradType>::create_storage(int start_index, ...@@ -167,9 +183,12 @@ void HeterComm<KeyType, ValType, GradType>::create_storage(int start_index,
#endif #endif
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::destroy_storage(int start_index, typename ValType,
int end_index) { typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::destroy_storage(
int start_index, int end_index) {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
auto& allocator = allocators_[start_index]; auto& allocator = allocators_[start_index];
auto& nodes = path_[start_index][end_index].nodes_; auto& nodes = path_[start_index][end_index].nodes_;
...@@ -184,13 +203,17 @@ void HeterComm<KeyType, ValType, GradType>::destroy_storage(int start_index, ...@@ -184,13 +203,17 @@ void HeterComm<KeyType, ValType, GradType>::destroy_storage(int start_index,
#endif #endif
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index, typename ValType,
int num, typename GradType,
int* h_left, typename FVAccessor>
int* h_right, void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
KeyType* src_key, int start_index,
GradType* src_val) { int num,
int* h_left,
int* h_right,
KeyType* src_key,
GradType* src_val) {
int need_copy_val = 0; int need_copy_val = 0;
if (src_val) { if (src_val) {
need_copy_val = 1; need_copy_val = 1;
...@@ -267,14 +290,18 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index, ...@@ -267,14 +290,18 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index, typename ValType,
int gpu_num, typename GradType,
int* h_left, typename FVAccessor>
int* h_right, void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_dest(
KeyType* src_key, int start_index,
char* src_val, int gpu_num,
size_t val_size) { int* h_left,
int* h_right,
KeyType* src_key,
char* src_val,
size_t val_size) {
int need_copy_val = 0; int need_copy_val = 0;
if (src_val) { if (src_val) {
need_copy_val = 1; need_copy_val = 1;
...@@ -327,13 +354,17 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index, ...@@ -327,13 +354,17 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_dest(int start_index,
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::walk_to_src(int start_index, typename ValType,
int gpu_num, typename GradType,
int* h_left, typename FVAccessor>
int* h_right, void HeterComm<KeyType, ValType, GradType, FVAccessor>::walk_to_src(
char* src_val, int start_index,
size_t val_size) { int gpu_num,
int* h_left,
int* h_right,
char* src_val,
size_t val_size) {
std::queue<CopyTask> que; std::queue<CopyTask> que;
for (int i = 0; i < gpu_num; i++) { for (int i = 0; i < gpu_num; i++) {
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
...@@ -383,8 +414,11 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_src(int start_index, ...@@ -383,8 +414,11 @@ void HeterComm<KeyType, ValType, GradType>::walk_to_src(int start_index,
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
HeterComm<KeyType, ValType, GradType>::~HeterComm() { typename ValType,
typename GradType,
typename FVAccessor>
HeterComm<KeyType, ValType, GradType, FVAccessor>::~HeterComm() {
if (!multi_mf_dim_) { if (!multi_mf_dim_) {
for (auto& table : tables_) { for (auto& table : tables_) {
delete table; delete table;
...@@ -402,15 +436,22 @@ HeterComm<KeyType, ValType, GradType>::~HeterComm() { ...@@ -402,15 +436,22 @@ HeterComm<KeyType, ValType, GradType>::~HeterComm() {
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::show_one_table(int gpu_num) { typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::show_one_table(
int gpu_num) {
if (!multi_mf_dim_) { if (!multi_mf_dim_) {
tables_[gpu_num]->show(); tables_[gpu_num]->show();
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
int HeterComm<KeyType, ValType, GradType>::log2i(int x) { typename ValType,
typename GradType,
typename FVAccessor>
int HeterComm<KeyType, ValType, GradType, FVAccessor>::log2i(int x) {
unsigned res = 0; unsigned res = 0;
while (x >>= 1) { while (x >>= 1) {
++res; ++res;
...@@ -418,13 +459,20 @@ int HeterComm<KeyType, ValType, GradType>::log2i(int x) { ...@@ -418,13 +459,20 @@ int HeterComm<KeyType, ValType, GradType>::log2i(int x) {
return res; return res;
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
int HeterComm<KeyType, ValType, GradType>::get_index_by_devid(int devid) { typename ValType,
typename GradType,
typename FVAccessor>
int HeterComm<KeyType, ValType, GradType, FVAccessor>::get_index_by_devid(
int devid) {
return resource_->get_index_by_devid(devid); return resource_->get_index_by_devid(devid);
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::set_sparse_sgd( typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) { const OptimizerConfig& optimizer_config) {
for (int i = 0; i < resource_->total_device(); ++i) { for (int i = 0; i < resource_->total_device(); ++i) {
AnyDeviceGuard guard(resource_->dev_id(i)); AnyDeviceGuard guard(resource_->dev_id(i));
...@@ -436,8 +484,11 @@ void HeterComm<KeyType, ValType, GradType>::set_sparse_sgd( ...@@ -436,8 +484,11 @@ void HeterComm<KeyType, ValType, GradType>::set_sparse_sgd(
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::set_embedx_sgd( typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) { const OptimizerConfig& optimizer_config) {
for (int i = 0; i < resource_->total_device(); ++i) { for (int i = 0; i < resource_->total_device(); ++i) {
AnyDeviceGuard guard(resource_->dev_id(i)); AnyDeviceGuard guard(resource_->dev_id(i));
...@@ -449,13 +500,17 @@ void HeterComm<KeyType, ValType, GradType>::set_embedx_sgd( ...@@ -449,13 +500,17 @@ void HeterComm<KeyType, ValType, GradType>::set_embedx_sgd(
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::build_ps(int dev_num, typename ValType,
KeyType* h_keys, typename GradType,
ValType* h_vals, typename FVAccessor>
size_t len, void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
size_t chunk_size, int dev_num,
int stream_num) { KeyType* h_keys,
ValType* h_vals,
size_t len,
size_t chunk_size,
int stream_num) {
if (len <= 0) { if (len <= 0) {
return; return;
} }
...@@ -518,14 +573,18 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int dev_num, ...@@ -518,14 +573,18 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int dev_num,
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::build_ps(int num, typename ValType,
KeyType* h_keys, typename GradType,
char* pool, typename FVAccessor>
size_t len, void HeterComm<KeyType, ValType, GradType, FVAccessor>::build_ps(
size_t feature_value_size, int num,
size_t chunk_size, KeyType* h_keys,
int stream_num) { char* pool,
size_t len,
size_t feature_value_size,
size_t chunk_size,
int stream_num) {
if (len <= 0) { if (len <= 0) {
return; return;
} }
...@@ -580,8 +639,11 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num, ...@@ -580,8 +639,11 @@ void HeterComm<KeyType, ValType, GradType>::build_ps(int num,
} }
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::merge_grad( typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::merge_grad(
int dev_num, int dev_num,
KeyType* d_keys, KeyType* d_keys,
GradType* d_grads, GradType* d_grads,
...@@ -654,13 +716,12 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad( ...@@ -654,13 +716,12 @@ void HeterComm<KeyType, ValType, GradType>::merge_grad(
sync_stream(stream); sync_stream(stream);
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad( typename ValType,
int gpu_num, typename GradType,
KeyType* d_keys, typename FVAccessor>
GradType* d_grads, void HeterComm<KeyType, ValType, GradType, FVAccessor>::dynamic_merge_grad(
size_t len, int gpu_num, KeyType* d_keys, float* d_grads, size_t len, int& uniq_len) {
int& uniq_len) {
int dev_id = resource_->dev_id(gpu_num); int dev_id = resource_->dev_id(gpu_num);
platform::CUDAPlace place = platform::CUDAPlace(dev_id); platform::CUDAPlace place = platform::CUDAPlace(dev_id);
platform::CUDADeviceGuard guard(dev_id); platform::CUDADeviceGuard guard(dev_id);
...@@ -668,16 +729,15 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad( ...@@ -668,16 +729,15 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad(
size_t temp_storage_bytes; size_t temp_storage_bytes;
// VLOG(1) << "hetercomm merge_grad: max_mf_dim: " << max_mf_dim_; auto accessor_wrapper_ptr =
size_t grad_value_size = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_merge_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr()); KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr());
auto d_merge_grads = memory::Alloc(place, len * grad_value_size); auto d_merge_grads = memory::Alloc(place, len * grad_value_size);
GradType* d_merge_grads_ptr = float* d_merge_grads_ptr = reinterpret_cast<float*>(d_merge_grads->ptr());
reinterpret_cast<GradType*>(d_merge_grads->ptr());
auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1)); auto d_fea_num_info = memory::Alloc(place, sizeof(uint32_t) * (len * 3 + 1));
uint32_t* d_fea_num_info_ptr = uint32_t* d_fea_num_info_ptr =
...@@ -772,7 +832,8 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad( ...@@ -772,7 +832,8 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad(
uniq_len, uniq_len,
grad_value_size, grad_value_size,
merger_, merger_,
stream); stream,
feature_value_accessor_);
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads, PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(d_grads,
d_merge_grads_ptr, d_merge_grads_ptr,
...@@ -782,8 +843,11 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad( ...@@ -782,8 +843,11 @@ void HeterComm<KeyType, ValType, GradType>::dynamic_merge_grad(
PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream)); PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamSynchronize(stream));
} }
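dynamic_merge_grad deduplicates keys and folds every gradient for a key into one variable-length float row before pushing. The CPU reference below sketches that idea (sort by key, then segment-merge, summing show/clk/lr_g and the mf slice); the row offsets and merge rule only loosely mirror merge_gradients_kernel and DynamicGradMerger and are assumptions.

#include <algorithm>
#include <cstdio>
#include <vector>

// Illustrative push-row layout: [slot, show, clk, mf_dim, lr_g, mf_g[0..mf_dim)].
constexpr int kSlot = 0, kShow = 1, kClk = 2, kMfDim = 3, kLrG = 4, kMfG = 5;

// CPU reference of "merge gradients per unique key"; the GPU path does the
// same with cub sort/encode plus a segment kernel.
void merge_by_key(std::vector<unsigned long>& keys,
                  std::vector<std::vector<float>>& rows) {
  std::vector<size_t> order(keys.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = i;
  std::sort(order.begin(), order.end(),
            [&](size_t a, size_t b) { return keys[a] < keys[b]; });

  std::vector<unsigned long> out_keys;
  std::vector<std::vector<float>> out_rows;
  for (size_t i : order) {
    if (!out_keys.empty() && out_keys.back() == keys[i]) {
      auto& dst = out_rows.back();
      const auto& src = rows[i];
      dst[kShow] += src[kShow];
      dst[kClk]  += src[kClk];
      dst[kLrG]  += src[kLrG];
      for (int j = 0; j < static_cast<int>(dst[kMfDim]); ++j)
        dst[kMfG + j] += src[kMfG + j];
    } else {
      out_keys.push_back(keys[i]);
      out_rows.push_back(rows[i]);  // first occurrence: plain copy (update_one)
    }
  }
  keys.swap(out_keys);
  rows.swap(out_rows);
}

int main() {
  std::vector<unsigned long> keys = {7, 3, 7};
  std::vector<std::vector<float>> rows = {
      {1, 1, 0, 2, 0.1f, 0.5f, 0.5f},
      {2, 1, 1, 2, 0.2f, 0.1f, 0.1f},
      {1, 1, 1, 2, 0.3f, 0.5f, 0.5f}};
  merge_by_key(keys, rows);
  std::printf("uniq_len=%zu, key0=%lu show=%g lr_g=%g\n",
              keys.size(), keys[0], rows[0][kShow], rows[0][kLrG]);
  return 0;
}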
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::split_input_to_shard( typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::split_input_to_shard(
KeyType* d_keys, KeyType* d_keys,
int* d_idx_ptr, int* d_idx_ptr,
size_t len, size_t len,
...@@ -843,11 +907,12 @@ void HeterComm<KeyType, ValType, GradType>::split_input_to_shard( ...@@ -843,11 +907,12 @@ void HeterComm<KeyType, ValType, GradType>::split_input_to_shard(
sync_stream(stream); sync_stream(stream);
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num, typename ValType,
KeyType* d_keys, typename GradType,
ValType* d_vals, typename FVAccessor>
size_t len) { void HeterComm<KeyType, ValType, GradType, FVAccessor>::pull_sparse(
int num, KeyType* d_keys, float* d_vals, size_t len) {
if (len == 0) { if (len == 0) {
return; return;
} }
...@@ -893,12 +958,15 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num, ...@@ -893,12 +958,15 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
auto d_idx = memory::Alloc(place, len * sizeof(int)); auto d_idx = memory::Alloc(place, len * sizeof(int));
int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr()); int* d_idx_ptr = reinterpret_cast<int*>(d_idx->ptr());
size_t val_type_size =
TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
size_t val_type_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
VLOG(3) << "pull_sparse len:" << len << " val_type_size: " << val_type_size;
auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType)); auto d_shard_keys = memory::Alloc(place, len * sizeof(KeyType));
KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr()); KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
auto d_shard_vals = memory::Alloc(place, len * val_type_size); auto d_shard_vals = memory::Alloc(place, len * val_type_size);
ValType* d_shard_vals_ptr = reinterpret_cast<ValType*>(d_shard_vals->ptr()); float* d_shard_vals_ptr = reinterpret_cast<float*>(d_shard_vals->ptr());
split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num); split_input_to_shard(d_keys, d_idx_ptr, len, d_left_ptr, d_right_ptr, num);
...@@ -944,7 +1012,8 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num, ...@@ -944,7 +1012,8 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage), ptr_tables_[i]->get(reinterpret_cast<KeyType*>(node.key_storage),
node.val_storage, node.val_storage,
h_right[i] - h_left[i] + 1, h_right[i] - h_left[i] + 1,
resource_->remote_stream(i, num)); resource_->remote_stream(i, num),
feature_value_accessor_);
} }
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
...@@ -964,10 +1033,16 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num, ...@@ -964,10 +1033,16 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
auto& node = path_[num][i].nodes_.front(); auto& node = path_[num][i].nodes_.front();
sync_stream(node.out_stream); sync_stream(node.out_stream);
} }
heter_comm_kernel_->dy_mf_fill_dvals( heter_comm_kernel_->dy_mf_fill_dvals(d_shard_vals_ptr,
d_shard_vals_ptr, d_vals, d_idx_ptr, len, val_type_size, stream); d_vals,
d_idx_ptr,
len,
val_type_size,
stream,
feature_value_accessor_);
sync_stream(stream); sync_stream(stream);
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
continue; continue;
...@@ -977,13 +1052,17 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num, ...@@ -977,13 +1052,17 @@ void HeterComm<KeyType, ValType, GradType>::pull_sparse(int num,
} }
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
template <typename Sgd> template <typename Sgd>
void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
KeyType* d_keys, int dev_num,
GradType* d_grads, KeyType* d_keys,
size_t len, float* d_grads,
Sgd& sgd) { // NOLINT size_t len,
Sgd& sgd) { // NOLINT
if (len == 0) { if (len == 0) {
return; return;
} }
...@@ -991,8 +1070,9 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, ...@@ -991,8 +1070,9 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
int total_device = resource_->total_device(); int total_device = resource_->total_device();
int dev_id = resource_->dev_id(dev_num); int dev_id = resource_->dev_id(dev_num);
size_t grad_value_size = auto accessor_wrapper_ptr =
TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
DevPlace place = DevPlace(dev_id); DevPlace place = DevPlace(dev_id);
AnyDeviceGuard guard(dev_id); AnyDeviceGuard guard(dev_id);
auto stream = resource_->local_stream(dev_num, 0); auto stream = resource_->local_stream(dev_num, 0);
...@@ -1037,8 +1117,7 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, ...@@ -1037,8 +1117,7 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr()); KeyType* d_shard_keys_ptr = reinterpret_cast<KeyType*>(d_shard_keys->ptr());
auto d_shard_grads = memory::Alloc(place, len * grad_value_size); auto d_shard_grads = memory::Alloc(place, len * grad_value_size);
GradType* d_shard_grads_ptr = float* d_shard_grads_ptr = reinterpret_cast<float*>(d_shard_grads->ptr());
reinterpret_cast<GradType*>(d_shard_grads->ptr());
int uniq_len = len; int uniq_len = len;
dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len); dynamic_merge_grad(dev_num, d_keys, d_grads, len, uniq_len);
...@@ -1048,24 +1127,15 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, ...@@ -1048,24 +1127,15 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
split_input_to_shard( split_input_to_shard(
d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num); d_keys, d_idx_ptr, uniq_len, d_left_ptr, d_right_ptr, dev_num);
if (!multi_mf_dim_) { heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr,
heter_comm_kernel_->fill_shard_grads(d_shard_keys_ptr, d_keys,
d_keys, d_shard_grads_ptr,
d_shard_grads_ptr, d_grads,
d_grads, d_idx_ptr,
d_idx_ptr, uniq_len,
uniq_len, grad_value_size,
stream); stream,
} else { feature_value_accessor_);
heter_comm_kernel_->dy_mf_fill_shard_grads(d_shard_keys_ptr,
d_keys,
d_shard_grads_ptr,
d_grads,
d_idx_ptr,
uniq_len,
grad_value_size,
stream);
}
sync_stream(stream); sync_stream(stream);
...@@ -1089,33 +1159,17 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, ...@@ -1089,33 +1159,17 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
continue; continue;
} }
if (!multi_mf_dim_) { create_storage(
create_storage(dev_num, dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size);
i,
shard_len * sizeof(KeyType),
shard_len * sizeof(GradType));
} else {
create_storage(
dev_num, i, shard_len * sizeof(KeyType), shard_len * grad_value_size);
}
} }
if (!multi_mf_dim_) { walk_to_dest(dev_num,
walk_to_dest(dev_num, total_device,
total_device, h_left,
h_left, h_right,
h_right, d_shard_keys_ptr,
d_shard_keys_ptr, reinterpret_cast<char*>(d_shard_grads_ptr),
d_shard_grads_ptr); grad_value_size);
} else {
walk_to_dest(dev_num,
total_device,
h_left,
h_right,
d_shard_keys_ptr,
reinterpret_cast<char*>(d_shard_grads_ptr),
grad_value_size);
}
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
if (h_left[i] == -1 || h_right[i] == -1) { if (h_left[i] == -1 || h_right[i] == -1) {
...@@ -1125,21 +1179,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, ...@@ -1125,21 +1179,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
sync_stream(node.in_stream); sync_stream(node.in_stream);
AnyDeviceGuard guard(resource_->dev_id(i)); AnyDeviceGuard guard(resource_->dev_id(i));
if (!multi_mf_dim_) { ptr_tables_[i]->rwlock_->WRLock();
tables_[i]->rwlock_->WRLock(); ptr_tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage),
tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage), node.val_storage,
reinterpret_cast<GradType*>(node.val_storage), h_right[i] - h_left[i] + 1,
h_right[i] - h_left[i] + 1, sgd,
sgd, resource_->remote_stream(i, dev_num));
resource_->remote_stream(i, dev_num));
} else {
ptr_tables_[i]->rwlock_->WRLock();
ptr_tables_[i]->update(reinterpret_cast<KeyType*>(node.key_storage),
node.val_storage,
h_right[i] - h_left[i] + 1,
sgd,
resource_->remote_stream(i, dev_num));
}
} }
for (int i = 0; i < total_device; ++i) { for (int i = 0; i < total_device; ++i) {
...@@ -1162,11 +1207,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, ...@@ -1162,11 +1207,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
} }
#elif defined(PADDLE_WITH_XPU_KP) #elif defined(PADDLE_WITH_XPU_KP)
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, typename ValType,
KeyType* d_keys, typename GradType,
GradType* d_grads, typename FVAccessor>
size_t len) { void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse(
int dev_num, KeyType* d_keys, GradType* d_grads, size_t len) {
if (len == 0) { if (len == 0) {
return; return;
} }
...@@ -1302,9 +1348,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num, ...@@ -1302,9 +1348,12 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse(int dev_num,
#endif #endif
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
template <typename Sgd> template <typename Sgd>
void HeterComm<KeyType, ValType, GradType>::update_one_table( void HeterComm<KeyType, ValType, GradType, FVAccessor>::update_one_table(
int gpu_num, int gpu_num,
KeyType* d_keys, KeyType* d_keys,
GradType* d_grads, GradType* d_grads,
...@@ -1323,9 +1372,12 @@ void HeterComm<KeyType, ValType, GradType>::update_one_table( ...@@ -1323,9 +1372,12 @@ void HeterComm<KeyType, ValType, GradType>::update_one_table(
cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num)); cudaStreamSynchronize(resource_->remote_stream(gpu_num, gpu_num));
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
typename ValType,
typename GradType,
typename FVAccessor>
template <typename Sgd> template <typename Sgd>
void HeterComm<KeyType, ValType, GradType>::push_sparse_multi_node( void HeterComm<KeyType, ValType, GradType, FVAccessor>::push_sparse_multi_node(
int gpu_num, int gpu_num,
KeyType* d_keys, KeyType* d_keys,
GradType* d_grads, GradType* d_grads,
...@@ -1352,8 +1404,11 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse_multi_node( ...@@ -1352,8 +1404,11 @@ void HeterComm<KeyType, ValType, GradType>::push_sparse_multi_node(
sgd); sgd);
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad( typename ValType,
typename GradType,
typename FVAccessor>
int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_one_node_grad(
int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
int total_gpu = resource_->total_device(); int total_gpu = resource_->total_device();
int dev_id = resource_->dev_id(gpu_num); int dev_id = resource_->dev_id(gpu_num);
...@@ -1454,8 +1509,11 @@ int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad( ...@@ -1454,8 +1509,11 @@ int HeterComm<KeyType, ValType, GradType>::gather_one_node_grad(
return ret; return ret;
} }
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
int HeterComm<KeyType, ValType, GradType>::gather_multi_node_grad( typename ValType,
typename GradType,
typename FVAccessor>
int HeterComm<KeyType, ValType, GradType, FVAccessor>::gather_multi_node_grad(
int gpu_num, KeyType* d_keys, GradType* d_grads, int len) { int gpu_num, KeyType* d_keys, GradType* d_grads, int len) {
int dev_id = resource_->dev_id(gpu_num); int dev_id = resource_->dev_id(gpu_num);
auto& storage = storage_[gpu_num]; auto& storage = storage_[gpu_num];
...@@ -1525,8 +1583,11 @@ int HeterComm<KeyType, ValType, GradType>::gather_multi_node_grad( ...@@ -1525,8 +1583,11 @@ int HeterComm<KeyType, ValType, GradType>::gather_multi_node_grad(
} }
#endif #endif
template <typename KeyType, typename ValType, typename GradType> template <typename KeyType,
void HeterComm<KeyType, ValType, GradType>::end_pass() { typename ValType,
typename GradType,
typename FVAccessor>
void HeterComm<KeyType, ValType, GradType, FVAccessor>::end_pass() {
int total_device = resource_->total_device(); int total_device = resource_->total_device();
std::vector<std::thread> threads; std::vector<std::thread> threads;
...@@ -1547,8 +1608,10 @@ void HeterComm<KeyType, ValType, GradType>::end_pass() { ...@@ -1547,8 +1608,10 @@ void HeterComm<KeyType, ValType, GradType>::end_pass() {
} }
} }
// template <typename KeyType, typename ValType, typename GradType> // template <typename KeyType, typename ValType, typename GradType, typename
// void HeterComm<KeyType, ValType, GradType>::dump_to_cpu(int index) { // FVAccessor>
// void HeterComm<KeyType, ValType, GradType, FVAccessor>::dump_to_cpu(int
// index) {
// auto stream = resource_->local_stream(index, 0); // auto stream = resource_->local_stream(index, 0);
// int dev_id = resource_->dev_id(index); // int dev_id = resource_->dev_id(index);
// platform::CUDADeviceGuard guard(dev_id); // platform::CUDADeviceGuard guard(dev_id);
......
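The constructor and the pull/push paths above stop hard-coding TYPEALIGN(8, sizeof(FeatureValue) + ...) and instead ask the accessor wrapper for GetFeatureValueSize / GetPushValueSize, because adam-style rules keep extra optimizer state per row. The sketch below shows one plausible way such sizes could be derived from embed_sgd_dim / embedx_sgd_dim; the exact field breakdown is an assumption, not the real CommonFeatureValueAccessor layout.

#include <cstddef>
#include <cstdint>
#include <cstdio>

#define TYPEALIGN(ALIGNVAL, LEN) \
  (((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))

// Hedged sketch of accessor-driven value sizes; the per-field breakdown is
// illustrative only.
struct AccessorWrapperSketch {
  int embed_sgd_dim;   // 1 for adagrad-style rules, larger for adam-style ones
  int embedx_sgd_dim;  // optimizer state kept alongside the embedx weights
  size_t GetFeatureValueSize(int mf_dim) const {
    // assumed: header fields + embed state + embedx weights + embedx state
    int dim = 8 + embed_sgd_dim + mf_dim + embedx_sgd_dim;
    return TYPEALIGN(8, dim * sizeof(float));
  }
  size_t GetPushValueSize(int mf_dim) const {
    int dim = 5 + mf_dim;  // assumed: slot/show/clk/mf_dim/lr_g + mf_g
    return TYPEALIGN(8, dim * sizeof(float));
  }
};

int main() {
  AccessorWrapperSketch adagrad{1, 1}, adam{4, 4};  // illustrative dims
  std::printf("adagrad val=%zu push=%zu\n",
              adagrad.GetFeatureValueSize(8), adagrad.GetPushValueSize(8));
  std::printf("adam    val=%zu push=%zu\n",
              adam.GetFeatureValueSize(8), adam.GetPushValueSize(8));
  return 0;
}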
...@@ -128,22 +128,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals, ...@@ -128,22 +128,28 @@ __global__ void fill_dvals_kernel(ValType* d_shard_vals,
} }
} }
template <typename KeyType, typename GradType, typename T> template <typename KeyType, typename T, typename FVAccessor>
__global__ void dy_mf_fill_shard_grads_kernel(KeyType* d_shard_keys, __global__ void dy_mf_fill_shard_grads_kernel(
KeyType* d_keys, KeyType* d_shard_keys,
GradType* d_shard_grads, KeyType* d_keys,
GradType* d_grads, float* d_shard_grads,
T* idx, float* d_grads,
size_t len, T* idx,
size_t grad_value_size) { size_t len,
size_t grad_value_size,
FVAccessor feature_value_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) { if (i < len) {
d_shard_keys[i] = d_keys[idx[i]]; d_shard_keys[i] = d_keys[idx[i]];
*(GradType*)((char*)d_shard_grads + i * grad_value_size) = float* cur = (float*)((char*)d_shard_grads + i * grad_value_size);
*(GradType*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size); float* shard_val =
(float*)((char*)d_grads + uint64_t(idx[i]) * grad_value_size);
feature_value_accessor.PushValueFill(cur, shard_val);
} }
} }
template <typename FVAccessor>
__global__ void merge_gradients_kernel(const uint32_t* offset, __global__ void merge_gradients_kernel(const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
...@@ -151,36 +157,40 @@ __global__ void merge_gradients_kernel(const uint32_t* offset, ...@@ -151,36 +157,40 @@ __global__ void merge_gradients_kernel(const uint32_t* offset,
char* output, char* output,
int n, int n,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger_) { DynamicGradMerger& merger,
FVAccessor& feature_value_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) { if (i < n) {
uint32_t start = offset[i]; uint32_t start = offset[i];
uint32_t num = fea_num[i]; uint32_t num = fea_num[i];
int ori_index = index[start]; int ori_index = index[start];
FeaturePushValue& out = *(FeaturePushValue*)(output + i * grad_value_size); float* out = (float*)(output + i * grad_value_size);
FeaturePushValue& in = float* in = (float*)(input + size_t(ori_index) * grad_value_size);
*(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); merger.update_one(out, in, feature_value_accessor);
merger_.update_one(out, in);
for (int j = 1; j < num; ++j) { for (int j = 1; j < num; ++j) {
ori_index = index[start + j]; ori_index = index[start + j];
FeaturePushValue& rhs = in = (float*)(input + size_t(ori_index) * grad_value_size);
*(FeaturePushValue*)(input + size_t(ori_index) * grad_value_size); merger.merge_one(out, in, feature_value_accessor);
merger_.merge_one(out, rhs);
} }
} }
} }
template <typename ValType, typename T> template <typename T, typename FVAccessor>
__global__ void dy_mf_fill_dvals_kernel(ValType* d_shard_vals, __global__ void dy_mf_fill_dvals_kernel(float* d_shard_vals,
ValType* d_vals, float* d_vals,
T* idx, T* idx,
size_t len, size_t len,
size_t val_size) { size_t val_size,
FVAccessor feature_value_accessor) {
const size_t i = blockIdx.x * blockDim.x + threadIdx.x; const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < len) { if (i < len) {
uint64_t new_offset = uint64_t(idx[i]) * val_size; uint64_t new_offset = uint64_t(idx[i]) * val_size;
*(ValType*)((char*)d_vals + new_offset) = float* cur = (float*)((char*)d_vals + new_offset);
*(ValType*)((char*)d_shard_vals + i * val_size); float* shard_val = (float*)((char*)d_shard_vals + uint64_t(i) * val_size);
int mf_dim = int(
shard_val[feature_value_accessor.common_feature_value.MfDimIndex()]);
feature_value_accessor.FeatureValueFill(cur, shard_val, mf_dim);
} }
} }
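dy_mf_fill_dvals_kernel no longer copies a fixed ValType struct: it reads each row's own mf_dim through MfDimIndex() and lets the accessor copy exactly that many floats, which is what allows mixed embedding widths in a single pull batch. A tiny host-side analogue follows; the offsets in ToyValueAccessor are placeholders.

#include <cstdio>
#include <cstring>

// Placeholder offsets; only the MfDimIndex() call mirrors the kernel above.
struct ToyValueAccessor {
  int MfDimIndex() const { return 3; }
  int EmbedxWIndex() const { return 5; }  // assumed start of the mf weights
  void FeatureValueFill(float* dst, const float* src, int mf_dim) const {
    std::memcpy(dst, src, (EmbedxWIndex() + mf_dim) * sizeof(float));
  }
};

int main() {
  ToyValueAccessor acc;
  // one shard row whose own mf_dim field says "4 embedding floats follow"
  float shard_row[9] = {0, 1, 0, /*mf_dim=*/4, 0.01f, .1f, .2f, .3f, .4f};
  float dst_row[9] = {0};
  int mf_dim = static_cast<int>(shard_row[acc.MfDimIndex()]);
  acc.FeatureValueFill(dst_row, shard_row, mf_dim);
  std::printf("copied mf_dim=%d, last=%g\n", mf_dim, dst_row[8]);
  return 0;
}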
...@@ -312,15 +322,20 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage, ...@@ -312,15 +322,20 @@ void HeterCommKernel::reduce_by_key(void* d_temp_storage,
debug_synchronous)); debug_synchronous));
} }
template <typename KeyType, typename GradType, typename T, typename StreamType> template <typename KeyType,
void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, typename T,
KeyType* d_keys, typename StreamType,
GradType* d_shard_grads, typename FVAccessor>
GradType* d_grads, void HeterCommKernel::dy_mf_fill_shard_grads(
T* idx, KeyType* d_shard_keys,
long long len, KeyType* d_keys,
size_t grad_value_size, float* d_shard_grads,
const StreamType& stream) { float* d_grads,
T* idx,
long long len,
size_t grad_value_size,
const StreamType& stream,
FVAccessor& feature_value_accessor) {
int grid_size = (len - 1) / block_size_ + 1; int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len; size_t c_len = (size_t)len;
dy_mf_fill_shard_grads_kernel<<<grid_size, block_size_, 0, stream>>>( dy_mf_fill_shard_grads_kernel<<<grid_size, block_size_, 0, stream>>>(
...@@ -330,10 +345,11 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys, ...@@ -330,10 +345,11 @@ void HeterCommKernel::dy_mf_fill_shard_grads(KeyType* d_shard_keys,
d_grads, d_grads,
idx, idx,
c_len, c_len,
grad_value_size); grad_value_size,
feature_value_accessor);
} }
template <typename StreamType> template <typename StreamType, typename FVAccessor>
void HeterCommKernel::merge_gradient(const uint32_t* offset, void HeterCommKernel::merge_gradient(const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
...@@ -342,23 +358,33 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset, ...@@ -342,23 +358,33 @@ void HeterCommKernel::merge_gradient(const uint32_t* offset,
int n, int n,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger_, DynamicGradMerger& merger_,
const StreamType& stream) { const StreamType& stream,
FVAccessor& feature_value_accessor) {
int grid_size = (n - 1) / block_size_ + 1; int grid_size = (n - 1) / block_size_ + 1;
merge_gradients_kernel<<<grid_size, block_size_, 0, stream>>>( merge_gradients_kernel<<<grid_size, block_size_, 0, stream>>>(
offset, fea_num, index, input, output, n, grad_value_size, merger_); offset,
fea_num,
index,
input,
output,
n,
grad_value_size,
merger_,
feature_value_accessor);
} }
template <typename ValType, typename T, typename StreamType> template <typename T, typename StreamType, typename FVAccessor>
void HeterCommKernel::dy_mf_fill_dvals(ValType* d_shard_vals, void HeterCommKernel::dy_mf_fill_dvals(float* d_shard_vals,
ValType* d_vals, float* d_vals,
T* idx, T* idx,
long long len, long long len,
size_t val_size, size_t val_size,
const StreamType& stream) { const StreamType& stream,
FVAccessor& feature_value_accessor) {
int grid_size = (len - 1) / block_size_ + 1; int grid_size = (len - 1) / block_size_ + 1;
size_t c_len = (size_t)len; size_t c_len = (size_t)len;
dy_mf_fill_dvals_kernel<<<grid_size, block_size_, 0, stream>>>( dy_mf_fill_dvals_kernel<<<grid_size, block_size_, 0, stream>>>(
d_shard_vals, d_vals, idx, c_len, val_size); d_shard_vals, d_vals, idx, c_len, val_size, feature_value_accessor);
} }
template void HeterCommKernel::fill_idx<int, cudaStream_t>( template void HeterCommKernel::fill_idx<int, cudaStream_t>(
...@@ -402,17 +428,15 @@ template void HeterCommKernel::fill_shard_key<unsigned long, int, cudaStream_t>( ...@@ -402,17 +428,15 @@ template void HeterCommKernel::fill_shard_key<unsigned long, int, cudaStream_t>(
long long len, long long len,
const cudaStream_t& stream); const cudaStream_t& stream);
template void HeterCommKernel::fill_shard_grads< template void
unsigned long, HeterCommKernel::fill_shard_grads<unsigned long, float, int, cudaStream_t>(
paddle::framework::FeaturePushValue, unsigned long* d_shard_keys,
int, unsigned long* d_keys,
cudaStream_t>(unsigned long* d_shard_keys, float* d_shard_grads,
unsigned long* d_keys, float* d_grads,
paddle::framework::FeaturePushValue* d_shard_grads, int* idx,
paddle::framework::FeaturePushValue* d_grads, long long len,
int* idx, const cudaStream_t& stream);
long long len,
const cudaStream_t& stream);
template void template void
HeterCommKernel::fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>( HeterCommKernel::fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>(
...@@ -467,20 +491,23 @@ template void HeterCommKernel::reduce_by_key< ...@@ -467,20 +491,23 @@ template void HeterCommKernel::reduce_by_key<
cudaStream_t stream, cudaStream_t stream,
bool debug_synchronous); bool debug_synchronous);
template void HeterCommKernel::dy_mf_fill_shard_grads< template void
unsigned long, HeterCommKernel::dy_mf_fill_shard_grads<unsigned long,
paddle::framework::FeaturePushValue, int,
int, cudaStream_t,
cudaStream_t>(unsigned long* d_shard_keys, CommonFeatureValueAccessor>(
unsigned long* d_keys, unsigned long* d_shard_keys,
paddle::framework::FeaturePushValue* d_shard_grads, unsigned long* d_keys,
paddle::framework::FeaturePushValue* d_grads, float* d_shard_grads,
int* idx, float* d_grads,
long long len, int* idx,
size_t grad_value_size, long long len,
const cudaStream_t& stream); size_t grad_value_size,
const cudaStream_t& stream,
template void HeterCommKernel::merge_gradient<cudaStream_t>( CommonFeatureValueAccessor& feature_value_accessor);
template void
HeterCommKernel::merge_gradient<cudaStream_t, CommonFeatureValueAccessor>(
const uint32_t* offset, const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
...@@ -489,16 +516,18 @@ template void HeterCommKernel::merge_gradient<cudaStream_t>( ...@@ -489,16 +516,18 @@ template void HeterCommKernel::merge_gradient<cudaStream_t>(
int n, int n,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger_, DynamicGradMerger& merger_,
const cudaStream_t& stream); const cudaStream_t& stream,
CommonFeatureValueAccessor& feature_value_accessor);
template void HeterCommKernel:: template void HeterCommKernel::
dy_mf_fill_dvals<paddle::framework::FeatureValue, int, cudaStream_t>( dy_mf_fill_dvals<int, cudaStream_t, CommonFeatureValueAccessor>(
paddle::framework::FeatureValue* d_shard_vals, float* d_shard_vals,
paddle::framework::FeatureValue* d_vals, float* d_vals,
int* idx, int* idx,
long long len, long long len,
size_t val_size, size_t val_size,
const cudaStream_t& stream); const cudaStream_t& stream,
CommonFeatureValueAccessor& feature_value_accessor);
#endif #endif
} // namespace framework } // namespace framework
......
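Each host wrapper in this file ends with explicit instantiations for CommonFeatureValueAccessor because the template bodies live in a .cu translation unit; callers elsewhere only see declarations and must link against instantiations generated here. The single-file sketch below imitates that split with comments (plain C++ standing in for CUDA); all names are invented.

#include <cstdio>

// Shared-header part: the accessor type both sides agree on.
struct CommonAccessorSketch { unsigned row_floats() const { return 12; } };

// Part that would live in the .cu file: the template definition...
template <typename Accessor>
void dy_mf_fill_dvals_sketch(const Accessor& acc, int n) {
  std::printf("fill %d rows x %u floats\n", n, acc.row_floats());
}
// ...plus the explicit instantiation, so a caller that only saw a
// declaration could still link against this translation unit.
template void dy_mf_fill_dvals_sketch<CommonAccessorSketch>(
    const CommonAccessorSketch&, int);

// Part that would live in a caller .cc holding only the declaration.
int main() {
  dy_mf_fill_dvals_sketch(CommonAccessorSketch{}, 3);
  return 0;
}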
...@@ -41,25 +41,16 @@ struct DynamicGradMerger { ...@@ -41,25 +41,16 @@ struct DynamicGradMerger {
return out; return out;
} }
template <typename T> template <typename FVAccessor>
__device__ __forceinline__ void update_one(T& output, const T& input) { __device__ __forceinline__ void update_one(
output.slot = input.slot; float* output, const float* input, FVAccessor& feature_value_accessor) {
output.show = input.show; feature_value_accessor.PushValueFill(output, input);
output.clk = input.clk;
output.mf_dim = input.mf_dim;
output.lr_g = input.lr_g;
for (int i = 0; i < output.mf_dim; ++i) {
output.mf_g[i] = input.mf_g[i];
}
} }
template <typename T>
__device__ __forceinline__ void merge_one(T& output, const T& input) { template <typename FVAccessor>
output.show += input.show; __device__ __forceinline__ void merge_one(
output.clk += input.clk; float* output, const float* input, FVAccessor& feature_value_accessor) {
output.lr_g += input.lr_g; feature_value_accessor.MergePushValue(output, input);
for (int i = 0; i < input.mf_dim; ++i) {
output.mf_g[i] += input.mf_g[i];
}
} }
}; };
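update_one and merge_one now delegate to the accessor's PushValueFill / MergePushValue rather than touching FeaturePushValue fields directly, so the merger works for any row layout the accessor describes. The toy below only demonstrates that delegation; the five-float layout and the choice of summed fields mirror the removed struct-based code and are assumptions.

#include <cstdio>

// Toy accessor over a 5-float push row [show, clk, slot, mf_dim, lr_g];
// purely illustrative, not the CommonFeatureValueAccessor layout.
struct ToyPushAccessor {
  void PushValueFill(float* out, const float* in) const {
    for (int i = 0; i < 5; ++i) out[i] = in[i];
  }
  void MergePushValue(float* out, const float* in) const {
    out[0] += in[0];  // show
    out[1] += in[1];  // clk
    out[4] += in[4];  // lr_g
  }
};

struct DynamicGradMergerSketch {
  template <typename FVAccessor>
  void update_one(float* out, const float* in, FVAccessor& acc) const {
    acc.PushValueFill(out, in);
  }
  template <typename FVAccessor>
  void merge_one(float* out, const float* in, FVAccessor& acc) const {
    acc.MergePushValue(out, in);
  }
};

int main() {
  ToyPushAccessor acc;
  DynamicGradMergerSketch merger;
  float a[5] = {1, 0, 6, 8, 0.1f}, b[5] = {1, 1, 6, 8, 0.2f}, out[5];
  merger.update_one(out, a, acc);
  merger.merge_one(out, b, acc);
  std::printf("show=%g clk=%g lr_g=%g\n", out[0], out[1], out[4]);
  return 0;
}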
...@@ -146,19 +137,20 @@ class HeterCommKernel { ...@@ -146,19 +137,20 @@ class HeterCommKernel {
bool debug_synchronous = false); bool debug_synchronous = false);
template <typename KeyType, template <typename KeyType,
typename GradType,
typename T, typename T,
typename StreamType> typename StreamType,
typename FVAccessor>
void dy_mf_fill_shard_grads(KeyType* d_shard_keys, void dy_mf_fill_shard_grads(KeyType* d_shard_keys,
KeyType* d_keys, KeyType* d_keys,
GradType* d_shard_grads, float* d_shard_grads,
GradType* d_grads, float* d_grads,
T* idx, T* idx,
long long len, long long len,
size_t grad_value_size, size_t grad_value_size,
const StreamType& stream); const StreamType& stream,
FVAccessor& feature_value_accessor);
template <typename StreamType> template <typename StreamType, typename FVAccessor>
void merge_gradient(const uint32_t* offset, void merge_gradient(const uint32_t* offset,
const uint32_t* fea_num, const uint32_t* fea_num,
const uint32_t* index, const uint32_t* index,
...@@ -167,15 +159,17 @@ class HeterCommKernel { ...@@ -167,15 +159,17 @@ class HeterCommKernel {
int n, int n,
size_t grad_value_size, size_t grad_value_size,
DynamicGradMerger& merger_, DynamicGradMerger& merger_,
const StreamType& stream); const StreamType& stream,
FVAccessor& feature_value_accessor);
template <typename ValType, typename T, typename StreamType> template <typename T, typename StreamType, typename FVAccessor>
void dy_mf_fill_dvals(ValType* d_shard_vals, void dy_mf_fill_dvals(float* d_shard_vals,
ValType* d_vals, float* d_vals,
T* idx, T* idx,
long long len, long long len,
size_t val_size, size_t val_size,
const StreamType& stream); const StreamType& stream,
FVAccessor& feature_value_accessor);
private: private:
int block_size_{256}; int block_size_{256};
......
...@@ -22,34 +22,43 @@ namespace paddle { ...@@ -22,34 +22,43 @@ namespace paddle {
namespace framework { namespace framework {
HeterPsBase* HeterPsBase::get_instance( HeterPsBase* HeterPsBase::get_instance(
size_t capacity, std::shared_ptr<HeterPsResource> resource) { size_t capacity,
return new HeterPs(capacity, resource); std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" &&
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
} else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get "
<< accessor_type;
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
}
} }
HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) { HeterPs::HeterPs(size_t capacity,
comm_ = std::shared_ptr<HeterPsResource> resource,
std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>( std::unordered_map<std::string, float> fleet_config,
capacity, resource); std::string accessor_type,
int optimizer_type) {
comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
capacity, resource);
optimizer_type_ = optimizer_type;
} }
HeterPs::~HeterPs() {} HeterPs::~HeterPs() {}
void HeterPs::pull_sparse(int num, void HeterPs::pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeatureValue* d_vals, float* d_vals,
size_t len) { size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len); comm_->pull_sparse(num, d_keys, d_vals, len);
} }
void HeterPs::build_ps(int num,
FeatureKey* h_keys,
FeatureValue* h_vals,
size_t len,
size_t chunk_size,
int stream_num) {
comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num);
}
int HeterPs::get_index_by_devid(int devid) { int HeterPs::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid); return comm_->get_index_by_devid(devid);
} }
...@@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } ...@@ -68,7 +77,7 @@ void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }
void HeterPs::push_sparse(int num, void HeterPs::push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeaturePushValue* d_grads, float* d_grads,
size_t len) { size_t len) {
comm_->push_sparse(num, d_keys, d_grads, len); comm_->push_sparse(num, d_keys, d_grads, len);
// comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); // comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_);
......
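get_instance now takes the fleet config, accessor name, and optimizer type because they select the concrete HeterPs<FVAccessor> and, later, the SGD functor; only CtrDymfAccessor with optimizer types 1/3/4 is accepted, anything else falls back with a warning. The sketch below mimics that selection; the config key name is a placeholder, and the meanings of 1 and 4 are inferred from the PR description.

#include <cstdio>
#include <string>
#include <unordered_map>

// Hedged sketch of the selection logic in HeterPsBase::get_instance above;
// 3 -> adam appears in this diff, 1 -> adagrad and 4 -> shared adam are
// assumptions.
const char* describe(const std::string& accessor_type, int optimizer_type) {
  if (accessor_type != "CtrDymfAccessor") return "unsupported accessor";
  switch (optimizer_type) {
    case 1: return "CtrDymfAccessor + adagrad";
    case 3: return "CtrDymfAccessor + adam";
    case 4: return "CtrDymfAccessor + shared adam";
    default: return "unsupported optimizer";
  }
}

int main() {
  // fleet_config would carry hyperparameters; the key name here is a
  // placeholder, not a real GPUPS config key.
  std::unordered_map<std::string, float> fleet_config = {{"learning_rate", 0.05f}};
  std::printf("%s (lr=%g)\n", describe("CtrDymfAccessor", 3),
              fleet_config.at("learning_rate"));
  return 0;
}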
...@@ -22,80 +22,139 @@ namespace paddle { ...@@ -22,80 +22,139 @@ namespace paddle {
namespace framework { namespace framework {
HeterPsBase* HeterPsBase::get_instance( HeterPsBase* HeterPsBase::get_instance(
size_t capacity, std::shared_ptr<HeterPsResource> resource) { size_t capacity,
return new HeterPs(capacity, resource); std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
if (accessor_type == "CtrDymfAccessor" &&
(optimizer_type == 1 || optimizer_type == 3 || optimizer_type == 4)) {
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
} else {
VLOG(0) << " HeterPsBase get_instance Warning: now only support "
"CtrDymfAccessor, but get "
<< accessor_type;
return new HeterPs<CommonFeatureValueAccessor>(
capacity, resource, fleet_config, accessor_type, optimizer_type);
}
} }
HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) { template <typename FVAccessor>
comm_ = HeterPs<FVAccessor>::HeterPs(
std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>( size_t capacity,
capacity, resource); std::shared_ptr<HeterPsResource> resource,
opt_ = Optimizer<FeatureValue, FeaturePushValue>(); std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type) {
comm_ = std::make_shared<HeterComm<FeatureKey, float*, float*, FVAccessor>>(
capacity, resource);
feature_value_accessor_.Configure(fleet_config);
set_accessor(feature_value_accessor_);
accessor_type_ = accessor_type;
optimizer_type_ = optimizer_type;
} }
HeterPs::~HeterPs() {} template <typename FVAccessor>
HeterPs<FVAccessor>::~HeterPs() {}
void HeterPs::pull_sparse(int num, template <typename FVAccessor>
FeatureKey* d_keys, void HeterPs<FVAccessor>::pull_sparse(int num,
FeatureValue* d_vals, FeatureKey* d_keys,
size_t len) { float* d_vals,
size_t len) {
comm_->pull_sparse(num, d_keys, d_vals, len); comm_->pull_sparse(num, d_keys, d_vals, len);
} }
void HeterPs::build_ps(int num, template <typename FVAccessor>
FeatureKey* h_keys, void HeterPs<FVAccessor>::build_ps(int num,
FeatureValue* h_vals, FeatureKey* h_keys,
size_t len, char* pool,
size_t chunk_size, size_t len,
int stream_num) { size_t feature_value_size,
comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num); size_t chunk_size,
} int stream_num) {
void HeterPs::build_ps(int num,
FeatureKey* h_keys,
char* pool,
size_t len,
size_t feature_value_size,
size_t chunk_size,
int stream_num) {
comm_->build_ps( comm_->build_ps(
num, h_keys, pool, len, feature_value_size, chunk_size, stream_num); num, h_keys, pool, len, feature_value_size, chunk_size, stream_num);
} }
int HeterPs::get_index_by_devid(int devid) { template <typename FVAccessor>
int HeterPs<FVAccessor>::get_index_by_devid(int devid) {
return comm_->get_index_by_devid(devid); return comm_->get_index_by_devid(devid);
} }
void HeterPs::set_sparse_sgd(const OptimizerConfig& optimizer_config) { template <typename FVAccessor>
void HeterPs<FVAccessor>::set_sparse_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_sparse_sgd(optimizer_config); comm_->set_sparse_sgd(optimizer_config);
} }
void HeterPs::set_embedx_sgd(const OptimizerConfig& optimizer_config) { template <typename FVAccessor>
void HeterPs<FVAccessor>::set_embedx_sgd(
const OptimizerConfig& optimizer_config) {
comm_->set_embedx_sgd(optimizer_config); comm_->set_embedx_sgd(optimizer_config);
} }
void HeterPs::end_pass() { comm_->end_pass(); } template <typename FVAccessor>
void HeterPs<FVAccessor>::end_pass() {
comm_->end_pass();
}
void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); } template <typename FVAccessor>
void HeterPs<FVAccessor>::show_one_table(int gpu_num) {
comm_->show_one_table(gpu_num);
}
void HeterPs::push_sparse(int num, template <typename FVAccessor>
FeatureKey* d_keys, void HeterPs<FVAccessor>::push_sparse(int num,
FeaturePushValue* d_grads, FeatureKey* d_keys,
size_t len) { float* d_grads,
comm_->push_sparse(num, d_keys, d_grads, len, opt_); size_t len) {
// comm_->push_sparse_multi_node(num, d_keys, d_grads, len, opt_); if (accessor_type_ == "CtrDymfAccessor") {
if (optimizer_type_ == 3) { // adam
auto optimizer = SparseAdamOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 4) { // shared_adam
auto optimizer = SparseAdamSharedOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdamSharedOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else if (optimizer_type_ == 1) {  // adagrad
auto optimizer = SparseAdagradOptimizer(feature_value_accessor_);
VLOG(5) << "INTO push_sparse SparseAdagradOptimizer, EmbedDim():"
<< optimizer.EmbedDim();
comm_->push_sparse(num, d_keys, d_grads, len, optimizer);
} else {
VLOG(0) << " push sparse Error: CtrDymfAccessor only support adagrad(1),"
"adam(3) or shared_adam(4), bug get optimizer type:"
<< optimizer_type_;
}
} else {
VLOG(0) << " push sparse Error: now only support CtrDymfAccessor, but get "
<< accessor_type_;
}
} }
void HeterPs::set_nccl_comm_and_size(const std::vector<ncclComm_t>& inner_comms, template <typename FVAccessor>
const std::vector<ncclComm_t>& inter_comms, void HeterPs<FVAccessor>::set_nccl_comm_and_size(
int comm_size) { const std::vector<ncclComm_t>& inner_comms,
const std::vector<ncclComm_t>& inter_comms,
int comm_size) {
comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size); comm_->set_nccl_comm_and_size(inner_comms, inter_comms, comm_size);
} }
void HeterPs::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) { template <typename FVAccessor>
void HeterPs<FVAccessor>::set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) {
comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim); comm_->set_multi_mf_dim(multi_mf_dim, max_mf_dim);
} }
template <typename FVAccessor>
void HeterPs<FVAccessor>::set_accessor(FVAccessor& accessor) {
comm_->set_accessor(accessor);
}
} // end namespace framework } // end namespace framework
} // end namespace paddle } // end namespace paddle
#endif #endif
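For orientation, here is a hedged sketch of how the now-templated HeterPs<CommonFeatureValueAccessor> is expected to be created through the extended factory. The capacity, device ids and fleet_config contents below are illustrative assumptions (and HeterPsResource is assumed to be constructed from a device-id list, as PSGPUWrapper does); only the argument order and the optimizer codes (1 = adagrad, 3 = adam, 4 = shared_adam, matching the dispatch in push_sparse above) come from this change.

  // Sketch only: values are placeholders, not taken from this diff.
  std::vector<int> dev_ids = {0, 1};                     // GPUs participating in the pass
  auto resource = std::make_shared<HeterPsResource>(dev_ids);
  std::unordered_map<std::string, float> fleet_config;   // filled from the sparse table's sgd config
  HeterPsBase* heter_ps = HeterPsBase::get_instance(
      /*capacity=*/100000,
      resource,
      fleet_config,
      /*accessor_type=*/"CtrDymfAccessor",
      /*optimizer_type=*/3);                             // selects SparseAdamOptimizer in push_sparse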
...@@ -26,24 +26,23 @@ limitations under the License. */ ...@@ -26,24 +26,23 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
template <typename FVAccessor>
class HeterPs : public HeterPsBase { class HeterPs : public HeterPsBase {
public: public:
HeterPs() {} HeterPs() {}
HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource); HeterPs(size_t capacity,
std::shared_ptr<HeterPsResource> resource,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type);
virtual ~HeterPs(); virtual ~HeterPs();
HeterPs(const HeterPs&) = delete; HeterPs(const HeterPs&) = delete;
HeterPs& operator=(const HeterPs&) = delete; HeterPs& operator=(const HeterPs&) = delete;
void pull_sparse(int num, void pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeatureValue* d_vals, float* d_vals,
size_t len) override; size_t len) override;
void build_ps(int num,
FeatureKey* h_keys,
FeatureValue* h_vals,
size_t len,
size_t chunk_size,
int stream_num) override;
void build_ps(int num, void build_ps(int num,
FeatureKey* h_keys, FeatureKey* h_keys,
char* pool, char* pool,
...@@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase { ...@@ -56,6 +55,8 @@ class HeterPs : public HeterPsBase {
const std::vector<ncclComm_t>& inter_comms, const std::vector<ncclComm_t>& inter_comms,
int comm_size) override; int comm_size) override;
void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override; void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) override;
void set_accessor(FVAccessor& accessor);
#endif #endif
void set_sparse_sgd(const OptimizerConfig& optimizer_config) override; void set_sparse_sgd(const OptimizerConfig& optimizer_config) override;
...@@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase { ...@@ -66,13 +67,15 @@ class HeterPs : public HeterPsBase {
void show_one_table(int gpu_num) override; void show_one_table(int gpu_num) override;
void push_sparse(int num, void push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeaturePushValue* d_grads, float* d_grads,
size_t len) override; size_t len) override;
private: private:
std::shared_ptr<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>> comm_; std::shared_ptr<HeterComm<FeatureKey, float*, float*, FVAccessor>> comm_;
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
Optimizer<FeatureValue, FeaturePushValue> opt_; FVAccessor feature_value_accessor_;
std::string accessor_type_;
int optimizer_type_;
#endif #endif
}; };
......
...@@ -34,14 +34,8 @@ class HeterPsBase { ...@@ -34,14 +34,8 @@ class HeterPsBase {
virtual void pull_sparse(int num, virtual void pull_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeatureValue* d_vals, float* d_vals,
size_t len) = 0; size_t len) = 0;
virtual void build_ps(int num,
FeatureKey* h_keys,
FeatureValue* h_vals,
size_t len,
size_t chunk_size,
int stream_num) = 0;
virtual void build_ps(int num, virtual void build_ps(int num,
FeatureKey* h_keys, FeatureKey* h_keys,
char* pool, char* pool,
...@@ -56,19 +50,25 @@ class HeterPsBase { ...@@ -56,19 +50,25 @@ class HeterPsBase {
const std::vector<ncclComm_t>& inter_comms, const std::vector<ncclComm_t>& inter_comms,
int comm_size) = 0; int comm_size) = 0;
virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0; virtual void set_multi_mf_dim(int multi_mf_dim, int max_mf_dim) = 0;
#endif #endif
virtual void end_pass() = 0; virtual void end_pass() = 0;
virtual void show_one_table(int gpu_num) = 0; virtual void show_one_table(int gpu_num) = 0;
virtual void push_sparse(int num, virtual void push_sparse(int num,
FeatureKey* d_keys, FeatureKey* d_keys,
FeaturePushValue* d_grads, float* d_grads,
size_t len) = 0; size_t len) = 0;
virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_sparse_sgd(const OptimizerConfig& optimizer_config) = 0;
virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0; virtual void set_embedx_sgd(const OptimizerConfig& optimizer_config) = 0;
static HeterPsBase* get_instance(size_t capacity, static HeterPsBase* get_instance(
std::shared_ptr<HeterPsResource> resource); size_t capacity,
std::shared_ptr<HeterPsResource> resource,
// CommonFeatureValueAccessor feature_value_accessor,
std::unordered_map<std::string, float> fleet_config,
std::string accessor_type,
int optimizer_type);
}; };
} // end namespace framework } // end namespace framework
......
...@@ -82,20 +82,6 @@ class HBMMemoryPool : public managed { ...@@ -82,20 +82,6 @@ class HBMMemoryPool : public managed {
cudaMemset(mem_, 0, block_size_ * capacity); cudaMemset(mem_, 0, block_size_ * capacity);
} }
friend std::ostream& operator<<(std::ostream& out, HBMMemoryPool& p) {
for (size_t k = 0; k < 5; k++) {
auto x = (FeatureValue*)(p.mem() + k * p.capacity());
out << "show: " << x->show << " clk: " << x->clk << " slot: " << x->slot
<< " lr: " << x->lr << " mf_dim: " << x->mf_size
<< " mf_size: " << x->mf_size << " mf:";
for (int i = 0; i < x->mf_size + 1; ++i) {
out << " " << x->mf[i];
}
out << "\n";
}
return out;
}
char* mem() { return mem_; } char* mem() { return mem_; }
size_t capacity() { return capacity_; } size_t capacity() { return capacity_; }
......
...@@ -27,134 +27,460 @@ namespace paddle { ...@@ -27,134 +27,460 @@ namespace paddle {
namespace framework { namespace framework {
#if defined(PADDLE_WITH_CUDA) #if defined(PADDLE_WITH_CUDA)
template <typename ValType, typename GradType>
class Optimizer { class Optimizer {
public: public:
Optimizer() {} __host__ Optimizer(CommonFeatureValueAccessor feature_value_accessor) {
feature_value_accessor_ = feature_value_accessor;
}
__host__ ~Optimizer() {}
__device__ void update_value(const OptimizerConfig& optimizer_config,
float& val, // NOLINT
const float& grad) {
printf(
"Warning: update_value will not used. Please use dy_mf_update_value\n");
}
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr,
const float* grad) {}
~Optimizer() {} CommonFeatureValueAccessor feature_value_accessor_;
void initialize() {} size_t _embedding_dim;
size_t _lr_embedding_dim;
};
class SparseAdagradOptimizer : public Optimizer {
public:
__host__ SparseAdagradOptimizer(
CommonFeatureValueAccessor feature_value_accessor)
: Optimizer(feature_value_accessor) {
_lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
}
__device__ void update_value_work(const OptimizerConfig& optimizer_config,
int n,
float* w,
float* sgd, // NOLINT
const float* g,
float scale) {
float& g2sum = sgd[G2SumIndex()];
double add_g2sum = 0;
double ratio = optimizer_config.mf_learning_rate *
sqrt(optimizer_config.mf_initial_g2sum /
(optimizer_config.mf_initial_g2sum + g2sum));
for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale;
w[i] += scaled_grad * ratio;
if (w[i] < optimizer_config.mf_min_bound)
w[i] = optimizer_config.mf_min_bound;
if (w[i] > optimizer_config.mf_max_bound)
w[i] = optimizer_config.mf_max_bound;
add_g2sum += scaled_grad * scaled_grad;
}
g2sum += add_g2sum / n;
}
__device__ void update_value(const OptimizerConfig& optimizer_config,
float& val, // NOLINT
const float& grad) {
printf(
"Warning: update_value will not used. Please use dy_mf_update_value\n");
}
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
float* ptr,
const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
float g_click =
grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
grad[feature_value_accessor_.common_push_value.SlotIndex()];
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click;
update_value_work(
optimizer_config,
1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
g_show);
int mf_dim =
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff *
(ptr[feature_value_accessor_.common_feature_value
.ShowIndex()] -
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) {
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state;
curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < mf_dim; ++i) {
ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range;
}
}
} else {
update_value_work(
optimizer_config,
mf_dim,
ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
g_show);
}
}
__host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
__host__ __device__ size_t EmbedDim() { return _lr_embedding_dim; }
__host__ __device__ size_t EmbedxDim() { return _embedding_dim; }
__host__ __device__ size_t G2SumIndex() { return 0; }
__host__ __device__ size_t EmbedxG2SumIndex() { return 0; }
};
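For reference, the per-dimension step that update_value_work above applies is plain AdaGrad with a shared accumulator; a sketch in this file's OptimizerConfig names, where s is the scale argument (the show count passed in by dy_mf_update_value):

$$\hat g_i = g_i / s,\qquad
w_i \leftarrow \operatorname{clip}\!\Big(w_i + \eta_{mf}\sqrt{\tfrac{G_0}{G_0+G}}\,\hat g_i,\ [w_{\min},\,w_{\max}]\Big),\qquad
G \leftarrow G + \frac{1}{n}\sum_{i=1}^{n}\hat g_i^{2},$$

with \eta_{mf} = mf_learning_rate, G_0 = mf_initial_g2sum, G the accumulator stored at G2SumIndex(), and [w_min, w_max] = [mf_min_bound, mf_max_bound]. dy_mf_update_value additionally materializes the embedx table the first time nonclk_coeff·(show − click) + clk_coeff·click reaches mf_create_thresholds, drawing each weight uniformly from [0, mf_initial_range).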
class SparseAdamOptimizer : public Optimizer {
public:
__host__ SparseAdamOptimizer(
CommonFeatureValueAccessor feature_value_accessor)
: Optimizer(feature_value_accessor) {
_lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
}
__device__ void update_lr(const OptimizerConfig& optimizer_config, __device__ void update_lr(const OptimizerConfig& optimizer_config,
float& w, // NOLINT int n,
float& g2sum, float* w,
float g, // NOLINT float* sgd,
const float* g,
float scale) { float scale) {
double add_g2sum = 0; float* moment1 = sgd + GSumIndex();
double ratio = optimizer_config.learning_rate * float* moment2 = sgd + G2SumIndex();
sqrt(optimizer_config.initial_g2sum / float* beta1_pow = sgd + Beta1PowIndex();
(optimizer_config.initial_g2sum + g2sum)); float* beta2_pow = sgd + Beta2PowIndex();
double scaled_grad = g / scale;
w += scaled_grad * ratio; float beta1_pow_ = *beta1_pow;
float beta2_pow_ = *beta2_pow;
if (w < optimizer_config.min_bound) w = optimizer_config.min_bound; float epsilon = 1e-08;
if (w > optimizer_config.max_bound) w = optimizer_config.max_bound; double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) /
(1.0 - beta1_pow_);
for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale;
add_g2sum += scaled_grad * scaled_grad; double new_moment1 =
optimizer_config.beta1_decay_rate * moment1[i] +
(1.0 - optimizer_config.beta1_decay_rate) * scaled_grad;
double new_moment2 =
optimizer_config.beta2_decay_rate * moment2[i] +
(1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad;
w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon));
if (w[i] < optimizer_config.mf_min_bound)
w[i] = optimizer_config.mf_min_bound;
if (w[i] > optimizer_config.mf_max_bound)
w[i] = optimizer_config.mf_max_bound;
g2sum += add_g2sum; moment1[i] = new_moment1;
moment2[i] = new_moment2;
}
(*beta1_pow) *= optimizer_config.beta1_decay_rate;
(*beta2_pow) *= optimizer_config.beta2_decay_rate;
} }
__device__ void update_mf(const OptimizerConfig& optimizer_config, __device__ void update_mf(const OptimizerConfig& optimizer_config,
int n, int n,
float* w, float* w,
float& g2sum, // NOLINT float* sgd,
const float* g, const float* g,
float scale) { float scale) {
double add_g2sum = 0; float* moment1 = sgd + EmbedxGSumIndex();
double ratio = optimizer_config.mf_learning_rate * float* moment2 = sgd + EmbedxG2SumIndex();
sqrt(optimizer_config.mf_initial_g2sum / float* beta1_pow = sgd + EmbedxBeta1PowIndex();
(optimizer_config.mf_initial_g2sum + g2sum)); float* beta2_pow = sgd + EmbedxBeta2PowIndex();
float beta1_pow_ = *beta1_pow;
float beta2_pow_ = *beta2_pow;
float epsilon = 1e-08;
double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) /
(1.0 - beta1_pow_);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale; double scaled_grad = g[i] / scale;
w[i] += scaled_grad * ratio; double new_moment1 =
optimizer_config.beta1_decay_rate * moment1[i] +
(1.0 - optimizer_config.beta1_decay_rate) * scaled_grad;
double new_moment2 =
optimizer_config.beta2_decay_rate * moment2[i] +
(1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad;
w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon));
if (w[i] < optimizer_config.mf_min_bound) if (w[i] < optimizer_config.mf_min_bound)
w[i] = optimizer_config.mf_min_bound; w[i] = optimizer_config.mf_min_bound;
if (w[i] > optimizer_config.mf_max_bound) if (w[i] > optimizer_config.mf_max_bound)
w[i] = optimizer_config.mf_max_bound; w[i] = optimizer_config.mf_max_bound;
add_g2sum += scaled_grad * scaled_grad;
}
g2sum += add_g2sum / n; moment1[i] = new_moment1;
moment2[i] = new_moment2;
}
(*beta1_pow) *= optimizer_config.beta1_decay_rate;
(*beta2_pow) *= optimizer_config.beta2_decay_rate;
} }
__device__ void update_value(const OptimizerConfig& optimizer_config, __device__ void update_value(const OptimizerConfig& optimizer_config,
ValType& val, // NOLINT float& val, // NOLINT
const GradType& grad) { const float& grad) {
val.slot = grad.slot; printf(
val.show += grad.show; "Warning: update_value will not used. Please use dy_mf_update_value\n");
val.clk += grad.clk; }
val.delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) + __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
optimizer_config.clk_coeff * grad.clk; float* ptr,
const float* grad) {
float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
float g_click =
grad[feature_value_accessor_.common_push_value.ClickIndex()];
update_lr(optimizer_config, val.lr, val.lr_g2sum, grad.lr_g, grad.show); ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
grad[feature_value_accessor_.common_push_value.SlotIndex()];
ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) +
optimizer_config.clk_coeff * g_click;
if (val.mf_size == 0) { update_lr(
optimizer_config,
1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
g_show);
int mf_dim =
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <= if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * (val.show - val.clk) + optimizer_config.nonclk_coeff *
optimizer_config.clk_coeff * val.clk) { (ptr[feature_value_accessor_.common_feature_value
val.mf_size = MF_DIM + 1; .ShowIndex()] -
val.mf[0] = 0; ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) {
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state; curandState state;
curand_init(clock64(), tid_x, 0, &state); curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < MF_DIM; ++i) { for (int i = 0; i < mf_dim; ++i) {
val.mf[i + 1] = ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range; (curand_uniform(&state)) * optimizer_config.mf_initial_range;
} }
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
} }
} else { } else {
update_mf(optimizer_config, update_mf(
MF_DIM, optimizer_config,
&val.mf[1], mf_dim,
val.mf[0], ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
grad.mf_g, ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad.show); grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
g_show);
}
// printf("EmbedxGIndex: %f, mf_gsum: %f, ",
// feature_value_accessor_.common_push_value.EmbedxGIndex(),
// ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex()]);
}
__host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
__host__ __device__ size_t EmbedDim() { return _lr_embedding_dim * 2 + 2; }
__host__ __device__ size_t EmbedxDim() { return _embedding_dim * 2 + 2; }
__host__ __device__ size_t GSumIndex() { return 0; }
__host__ __device__ size_t G2SumIndex() {
return GSumIndex() + _lr_embedding_dim;
}
__host__ __device__ size_t Beta1PowIndex() {
return G2SumIndex() + _lr_embedding_dim;
}
__host__ __device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; }
__host__ __device__ size_t EmbedxGSumIndex() { return 0; }
__host__ __device__ size_t EmbedxG2SumIndex() {
return EmbedxGSumIndex() + _embedding_dim;
}
__host__ __device__ size_t EmbedxBeta1PowIndex() {
return EmbedxG2SumIndex() + _embedding_dim;
}
__host__ __device__ size_t EmbedxBeta2PowIndex() {
return EmbedxBeta1PowIndex() + 1;
}
};
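Likewise, update_lr and update_mf above implement the Adam step below for every tracked dimension; m1/m2 live at GSumIndex()/G2SumIndex() (EmbedxGSumIndex()/EmbedxG2SumIndex() for the embedx slice) and the running beta powers at the two Pow indices. Note that epsilon is hard-coded to 1e-8 in these kernels rather than read from the new ada_epsilon field:

$$\begin{aligned}
\hat g_i &= g_i / s,\\
m_{1,i} &\leftarrow \beta_1 m_{1,i} + (1-\beta_1)\,\hat g_i,\qquad
m_{2,i} \leftarrow \beta_2 m_{2,i} + (1-\beta_2)\,\hat g_i^{2},\\
w_i &\leftarrow \operatorname{clip}\!\Big(w_i + \eta\,\frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}}\cdot\frac{m_{1,i}}{\sqrt{m_{2,i}}+\epsilon},\ [w_{\min},\,w_{\max}]\Big),\\
\beta_1^{\,t+1} &= \beta_1^{\,t}\beta_1,\qquad \beta_2^{\,t+1} = \beta_2^{\,t}\beta_2,
\end{aligned}$$

with \eta = learning_rate and \beta_1/\beta_2 = beta1_decay_rate/beta2_decay_rate. Each slice therefore stores 2·dim moments plus the two beta-power scalars, which is exactly what EmbedDim() and EmbedxDim() report.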
class SparseAdamSharedOptimizer : public Optimizer {
public:
__host__ SparseAdamSharedOptimizer(
CommonFeatureValueAccessor feature_value_accessor)
: Optimizer(feature_value_accessor) {
_lr_embedding_dim = 1;
_embedding_dim = feature_value_accessor_.common_feature_value.EmbedWDim();
}
__device__ void update_value_work(const OptimizerConfig& optimizer_config,
int n,
float* w,
float* sgd,
const float* g,
float scale) {
float* moment1 = sgd + GSumIndex();
float* moment2 = sgd + G2SumIndex();
float* beta1_pow = sgd + Beta1PowIndex();
float* beta2_pow = sgd + Beta2PowIndex();
float beta1_pow_ = *beta1_pow;
float beta2_pow_ = *beta2_pow;
float moment1_ = *moment1;
float moment2_ = *moment2;
float epsilon = 1e-08;
double ratio = optimizer_config.learning_rate * sqrt(1.0 - beta2_pow_) /
(1.0 - beta1_pow_);
double sum_mom1 = 0.0;
double sum_mom2 = 0.0;
for (int i = 0; i < n; ++i) {
double scaled_grad = g[i] / scale;
double new_moment1 =
optimizer_config.beta1_decay_rate * moment1_ +
(1.0 - optimizer_config.beta1_decay_rate) * scaled_grad;
double new_moment2 =
optimizer_config.beta2_decay_rate * moment2_ +
(1.0 - optimizer_config.beta2_decay_rate) * scaled_grad * scaled_grad;
w[i] += ratio * (new_moment1 / (sqrt(new_moment2) + epsilon));
if (w[i] < optimizer_config.mf_min_bound)
w[i] = optimizer_config.mf_min_bound;
if (w[i] > optimizer_config.mf_max_bound)
w[i] = optimizer_config.mf_max_bound;
sum_mom1 += new_moment1;
sum_mom2 += new_moment2;
} }
(*moment1) = sum_mom1 / n;
(*moment2) = sum_mom2 / n;
(*beta1_pow) *= optimizer_config.beta1_decay_rate;
(*beta2_pow) *= optimizer_config.beta2_decay_rate;
}
__device__ void update_value(const OptimizerConfig& optimizer_config,
float& val, // NOLINT
const float& grad) {
printf(
"Warning: update_value will not used. Please use dy_mf_update_value\n");
} }
__device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config, __device__ void dy_mf_update_value(const OptimizerConfig& optimizer_config,
ValType* ptr, float* ptr,
const GradType& grad) { const float* grad) {
ptr->slot = grad.slot; float g_show = grad[feature_value_accessor_.common_push_value.ShowIndex()];
ptr->show += grad.show; float g_click =
ptr->clk += grad.clk; grad[feature_value_accessor_.common_push_value.ClickIndex()];
ptr->delta_score += optimizer_config.nonclk_coeff * (grad.show - grad.clk) +
optimizer_config.clk_coeff * grad.clk; ptr[feature_value_accessor_.common_feature_value.SlotIndex()] =
grad[feature_value_accessor_.common_push_value.SlotIndex()];
update_lr(optimizer_config, ptr->lr, ptr->lr_g2sum, grad.lr_g, grad.show); ptr[feature_value_accessor_.common_feature_value.ShowIndex()] += g_show;
// use MF_DIM temporarily ptr[feature_value_accessor_.common_feature_value.ClickIndex()] += g_click;
// ptr->mf_dim = grad.mf_dim; ptr[feature_value_accessor_.common_feature_value.DeltaScoreIndex()] +=
optimizer_config.nonclk_coeff * (g_show - g_click) +
if (ptr->mf_size == 0) { optimizer_config.clk_coeff * g_click;
update_value_work(
optimizer_config,
1,
ptr + feature_value_accessor_.common_feature_value.EmbedWIndex(),
ptr + feature_value_accessor_.common_feature_value.EmbedG2SumIndex(),
grad + feature_value_accessor_.common_push_value.EmbedGIndex(),
g_show);
int mf_dim =
int(ptr[feature_value_accessor_.common_feature_value.MfDimIndex()]);
if (ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] == 0) {
if (optimizer_config.mf_create_thresholds <= if (optimizer_config.mf_create_thresholds <=
optimizer_config.nonclk_coeff * (ptr->show - ptr->clk) + optimizer_config.nonclk_coeff *
optimizer_config.clk_coeff * ptr->clk) { (ptr[feature_value_accessor_.common_feature_value
ptr->mf_size = ptr->mf_dim + 1; .ShowIndex()] -
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) +
optimizer_config.clk_coeff *
ptr[feature_value_accessor_.common_feature_value
.ClickIndex()]) {
ptr[feature_value_accessor_.common_feature_value.MfSizeIndex()] =
feature_value_accessor_.common_feature_value.MFSize(mf_dim) /
sizeof(float);
// ptr->mf_size = MF_DIM + 1;
ptr->mf[0] = 0;
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
curandState state; curandState state;
curand_init(clock64(), tid_x, 0, &state); curand_init(clock64(), tid_x, 0, &state);
for (int i = 0; i < ptr->mf_dim; ++i) { for (int i = 0; i < mf_dim; ++i) {
ptr->mf[i + 1] = ptr[feature_value_accessor_.common_feature_value.EmbedxWIndex() + i] =
(curand_uniform(&state)) * optimizer_config.mf_initial_range; (curand_uniform(&state)) * optimizer_config.mf_initial_range;
} }
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta1PowIndex()] = optimizer_config.beta1_decay_rate;
ptr[feature_value_accessor_.common_feature_value.EmbedxG2SumIndex() +
EmbedxBeta2PowIndex()] = optimizer_config.beta2_decay_rate;
} }
} else { } else {
update_mf(optimizer_config, update_value_work(
ptr->mf_dim, optimizer_config,
&(ptr->mf[1]), mf_dim,
ptr->mf[0], ptr + feature_value_accessor_.common_feature_value.EmbedxWIndex(),
grad.mf_g, ptr + feature_value_accessor_.common_feature_value.EmbedxG2SumIndex(),
grad.show); // for local test grad + feature_value_accessor_.common_push_value.EmbedxGIndex(),
g_show);
} }
} }
__host__ __device__ size_t Dim() { return EmbedDim() + EmbedxDim(); }
__host__ __device__ size_t EmbedDim() { return 4; }
__host__ __device__ size_t EmbedxDim() { return 4; }
__host__ __device__ size_t GSumIndex() { return 0; }
__host__ __device__ size_t G2SumIndex() { return GSumIndex() + 1; }
__host__ __device__ size_t Beta1PowIndex() { return G2SumIndex() + 1; }
__host__ __device__ size_t Beta2PowIndex() { return Beta1PowIndex() + 1; }
__host__ __device__ size_t EmbedxGSumIndex() { return 0; }
__host__ __device__ size_t EmbedxG2SumIndex() {
return EmbedxGSumIndex() + 1;
}
__host__ __device__ size_t EmbedxBeta1PowIndex() {
return EmbedxG2SumIndex() + 1;
}
__host__ __device__ size_t EmbedxBeta2PowIndex() {
return EmbedxBeta1PowIndex() + 1;
}
}; };
#endif #endif
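The shared variant above differs only in its state layout: update_value_work applies the same Adam step to every dimension using one stored (m1, m2) pair and then writes back the dimension-averaged moments,

$$m_1 \leftarrow \frac{1}{n}\sum_{i=1}^{n} m_{1,i}^{\text{new}},\qquad
m_2 \leftarrow \frac{1}{n}\sum_{i=1}^{n} m_{2,i}^{\text{new}},$$

so the optimizer state per slice shrinks to four floats (moment1, moment2 and the two beta powers), as reflected by its EmbedDim()/EmbedxDim().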
......
...@@ -27,13 +27,19 @@ class OptimizerConfig { ...@@ -27,13 +27,19 @@ class OptimizerConfig {
float learning_rate = 0.05; float learning_rate = 0.05;
float initial_g2sum = 3.0; float initial_g2sum = 3.0;
float initial_range = 0; float initial_range = 0;
float beta1_decay_rate = 0.9; // adam
float beta2_decay_rate = 0.999; // adam
float ada_epsilon = 1e-8;
float mf_create_thresholds = 10; float mf_create_thresholds = 10;
float mf_learning_rate = 0.05; float mf_learning_rate = 0.05;
float mf_initial_g2sum = 3.0; float mf_initial_g2sum = 3.0;
float mf_initial_range = 1e-4; float mf_initial_range = 1e-4;
float mf_beta1_decay_rate = 0.9; // adam
float mf_beta2_decay_rate = 0.999; // adam
float mf_min_bound = -10; float mf_min_bound = -10;
float mf_max_bound = 10; float mf_max_bound = 10;
float mf_ada_epsilon = 1e-8;
void set_sparse_sgd(float nonclk_coeff, void set_sparse_sgd(float nonclk_coeff,
float clk_coeff, float clk_coeff,
...@@ -41,7 +47,10 @@ class OptimizerConfig { ...@@ -41,7 +47,10 @@ class OptimizerConfig {
float max_bound, float max_bound,
float learning_rate, float learning_rate,
float initial_g2sum, float initial_g2sum,
float initial_range) { float initial_range,
float beta1_decay_rate,
float beta2_decay_rate,
float ada_epsilon) {
this->nonclk_coeff = nonclk_coeff; this->nonclk_coeff = nonclk_coeff;
this->clk_coeff = clk_coeff; this->clk_coeff = clk_coeff;
this->min_bound = min_bound; this->min_bound = min_bound;
...@@ -49,6 +58,9 @@ class OptimizerConfig { ...@@ -49,6 +58,9 @@ class OptimizerConfig {
this->learning_rate = learning_rate; this->learning_rate = learning_rate;
this->initial_g2sum = initial_g2sum; this->initial_g2sum = initial_g2sum;
this->initial_range = initial_range; this->initial_range = initial_range;
this->beta1_decay_rate = beta1_decay_rate;
this->beta2_decay_rate = beta2_decay_rate;
this->ada_epsilon = ada_epsilon;
} }
void set_sparse_sgd(const OptimizerConfig& optimizer_config) { void set_sparse_sgd(const OptimizerConfig& optimizer_config) {
...@@ -59,6 +71,9 @@ class OptimizerConfig { ...@@ -59,6 +71,9 @@ class OptimizerConfig {
this->learning_rate = optimizer_config.learning_rate; this->learning_rate = optimizer_config.learning_rate;
this->initial_g2sum = optimizer_config.initial_g2sum; this->initial_g2sum = optimizer_config.initial_g2sum;
this->initial_range = optimizer_config.initial_range; this->initial_range = optimizer_config.initial_range;
this->beta1_decay_rate = optimizer_config.beta1_decay_rate;
this->beta2_decay_rate = optimizer_config.beta2_decay_rate;
this->ada_epsilon = optimizer_config.ada_epsilon;
} }
void set_embedx_sgd(float mf_create_thresholds, void set_embedx_sgd(float mf_create_thresholds,
...@@ -66,13 +81,19 @@ class OptimizerConfig { ...@@ -66,13 +81,19 @@ class OptimizerConfig {
float mf_initial_g2sum, float mf_initial_g2sum,
float mf_initial_range, float mf_initial_range,
float mf_min_bound, float mf_min_bound,
float mf_max_bound) { float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
this->mf_create_thresholds = mf_create_thresholds; this->mf_create_thresholds = mf_create_thresholds;
this->mf_learning_rate = mf_learning_rate; this->mf_learning_rate = mf_learning_rate;
this->mf_initial_g2sum = mf_initial_g2sum; this->mf_initial_g2sum = mf_initial_g2sum;
this->mf_initial_range = mf_initial_range; this->mf_initial_range = mf_initial_range;
this->mf_min_bound = mf_min_bound; this->mf_min_bound = mf_min_bound;
this->mf_max_bound = mf_max_bound; this->mf_max_bound = mf_max_bound;
this->mf_beta1_decay_rate = mf_beta1_decay_rate;
this->mf_beta2_decay_rate = mf_beta2_decay_rate;
this->mf_ada_epsilon = mf_ada_epsilon;
} }
void set_embedx_sgd(const OptimizerConfig& optimizer_config) { void set_embedx_sgd(const OptimizerConfig& optimizer_config) {
...@@ -82,6 +103,9 @@ class OptimizerConfig { ...@@ -82,6 +103,9 @@ class OptimizerConfig {
this->mf_initial_range = optimizer_config.mf_initial_range; this->mf_initial_range = optimizer_config.mf_initial_range;
this->mf_min_bound = optimizer_config.mf_min_bound; this->mf_min_bound = optimizer_config.mf_min_bound;
this->mf_max_bound = optimizer_config.mf_max_bound; this->mf_max_bound = optimizer_config.mf_max_bound;
this->mf_beta1_decay_rate = optimizer_config.mf_beta1_decay_rate;
this->mf_beta2_decay_rate = optimizer_config.mf_beta2_decay_rate;
this->mf_ada_epsilon = optimizer_config.mf_ada_epsilon;
} }
}; };
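A minimal sketch of feeding the new Adam hyperparameters into a table follows. The parameters elided from this hunk (min_bound for set_sparse_sgd, mf_learning_rate for set_embedx_sgd) are assumed to keep their existing positions, and the numbers simply mirror the defaults declared above where visible; the rest are placeholders.

  OptimizerConfig config;
  // nonclk_coeff, clk_coeff, min_bound, max_bound, learning_rate,
  // initial_g2sum, initial_range, beta1_decay_rate, beta2_decay_rate, ada_epsilon
  config.set_sparse_sgd(0.1f, 1.0f, -10.0f, 10.0f, 0.05f, 3.0f, 0.0f, 0.9f, 0.999f, 1e-8f);
  // mf_create_thresholds, mf_learning_rate, mf_initial_g2sum, mf_initial_range,
  // mf_min_bound, mf_max_bound, mf_beta1_decay_rate, mf_beta2_decay_rate, mf_ada_epsilon
  config.set_embedx_sgd(10.0f, 0.05f, 3.0f, 1e-4f, -10.0f, 10.0f, 0.9f, 0.999f, 1e-8f);
  heter_ps->set_sparse_sgd(config);   // forwarded to every device table via HeterPs::set_sparse_sgd
  heter_ps->set_embedx_sgd(config);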
......
...@@ -33,9 +33,9 @@ limitations under the License. */ ...@@ -33,9 +33,9 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <deque> #include <deque>
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/platform/timer.h"
#if defined(PADDLE_WITH_PSCORE) #if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
#include "paddle/fluid/distributed/ps/table/depends/feature_value.h" #include "paddle/fluid/distributed/ps/table/depends/feature_value.h"
#endif #endif
...@@ -135,7 +135,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -135,7 +135,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
std::string data_set_name = std::string(typeid(*dataset_).name()); std::string data_set_name = std::string(typeid(*dataset_).name());
if (data_set_name.find("SlotRecordDataset") != std::string::npos) { if (data_set_name.find("SlotRecordDataset") != std::string::npos) {
SlotRecordDataset* dataset = dynamic_cast<SlotRecordDataset*>(dataset_); SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
auto input_channel = dataset->GetInputChannel(); auto input_channel = dataset->GetInputChannel();
VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size(); VLOG(0) << "psgpu wrapperinputslotchannle size: " << input_channel->Size();
const std::deque<SlotRecord>& vec_data = input_channel->GetData(); const std::deque<SlotRecord>& vec_data = input_channel->GetData();
...@@ -185,7 +185,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -185,7 +185,7 @@ void PSGPUWrapper::PreBuildTask(std::shared_ptr<HeterContext> gpu_task) {
} else { } else {
CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos);
VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset";
MultiSlotDataset* dataset = dynamic_cast<MultiSlotDataset*>(dataset_); MultiSlotDataset* dataset = (MultiSlotDataset*)(dataset_);
auto input_channel = dataset->GetInputChannel(); auto input_channel = dataset->GetInputChannel();
const std::deque<Record>& vec_data = input_channel->GetData(); const std::deque<Record>& vec_data = input_channel->GetData();
...@@ -540,17 +540,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) { ...@@ -540,17 +540,17 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
&device_vals, &device_vals,
&device_task_keys, &device_task_keys,
&device_task_ptrs](int dev, int shard_id) { &device_task_ptrs](int dev, int shard_id) {
auto& task_keys = device_task_keys[shard_id]; // auto& task_keys = device_task_keys[shard_id];
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
auto& task_ptrs = device_task_ptrs[shard_id]; auto& task_ptrs = device_task_ptrs[shard_id];
#endif #endif
#ifdef PADDLE_WITH_PSCORE // #ifdef PADDLE_WITH_PSCORE
auto& task_ptrs = device_task_ptrs[shard_id]; // auto& task_ptrs = device_task_ptrs[shard_id];
#endif // #endif
int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id]; // int len = prefix_sum[dev][shard_id + 1] - prefix_sum[dev][shard_id];
int cur = prefix_sum[dev][shard_id]; // int cur = prefix_sum[dev][shard_id];
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
for (int j = 0; j < len; ++j) { for (int j = 0; j < len; ++j) {
device_keys[dev][cur + j] = task_keys[dev][j]; device_keys[dev][cur + j] = task_keys[dev][j];
...@@ -579,33 +579,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) { ...@@ -579,33 +579,6 @@ void PSGPUWrapper::BuildPull(std::shared_ptr<HeterContext> gpu_task) {
} }
} }
#endif #endif
#ifdef PADDLE_WITH_PSCORE
for (int j = 0; j < len; ++j) {
device_keys[dev][cur + j] = task_keys[dev][j];
float* ptr_val = task_ptrs[dev][j]->data();
FeatureValue& val = device_vals[dev][cur + j];
size_t dim = task_ptrs[dev][j]->size();
val.delta_score = ptr_val[2];
val.show = ptr_val[3];
val.clk = ptr_val[4];
val.slot = ptr_val[0];
val.lr = ptr_val[5];
val.lr_g2sum = ptr_val[6];
val.cpu_ptr = (uint64_t)(task_ptrs[dev][j]);
if (dim > 7) {
val.mf_size = MF_DIM + 1;
for (int x = 0; x < val.mf_size; x++) {
val.mf[x] = ptr_val[x + 7];
}
} else {
val.mf_size = 0;
for (int x = 0; x < MF_DIM + 1; x++) {
val.mf[x] = 0;
}
}
}
#endif
VLOG(3) << "GpuPs build hbmps done"; VLOG(3) << "GpuPs build hbmps done";
}; };
...@@ -665,16 +638,25 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -665,16 +638,25 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
return; return;
} }
std::vector<std::thread> threads(device_num); std::vector<std::thread> threads(device_num);
HeterPs_ = HeterPsBase::get_instance(size_max, resource_); auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
HeterPs_ = HeterPsBase::get_instance(
size_max, resource_, fleet_config_, accessor_class_, optimizer_type_);
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_); HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_);
HeterPs_->set_sparse_sgd(optimizer_config_);
HeterPs_->set_embedx_sgd(optimizer_config_);
#endif #endif
auto build_dymf_mem_pool = [this, &gpu_task](int i, int j) { auto build_dymf_mem_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i,
int j) {
this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_);
int mf_dim = this->index_dim_vec_[j]; int mf_dim = this->index_dim_vec_[j];
VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim
<< " feature_value_size:"
<< accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
size_t feature_value_size = size_t feature_value_size =
TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_keys = gpu_task->device_dim_keys_[i][j];
auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j];
size_t len = device_dim_keys.size(); size_t len = device_dim_keys.size();
...@@ -682,12 +664,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -682,12 +664,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
this->mem_pools_[i * this->multi_mf_dim_ + j] = this->mem_pools_[i * this->multi_mf_dim_ + j] =
new MemoryPool(len, feature_value_size); new MemoryPool(len, feature_value_size);
}; };
auto build_dymf_hbm_pool = [this, &gpu_task](int i, int j) { auto build_dymf_hbm_pool = [this, &gpu_task, &accessor_wrapper_ptr](int i,
int j) {
auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_keys = gpu_task->device_dim_keys_[i][j];
size_t len = device_dim_keys.size(); size_t len = device_dim_keys.size();
int mf_dim = this->index_dim_vec_[j]; int mf_dim = this->index_dim_vec_[j];
size_t feature_value_size = size_t feature_value_size =
TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j]; auto& mem_pool = this->mem_pools_[i * this->multi_mf_dim_ + j];
platform::CUDADeviceGuard guard(resource_->dev_id(i)); platform::CUDADeviceGuard guard(resource_->dev_id(i));
...@@ -710,13 +693,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -710,13 +693,13 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
delete mem_pool; delete mem_pool;
}; };
int thread_num = 16; int thread_num = 16;
auto build_dynamic_mf_func = [this, &gpu_task, thread_num]( auto build_dynamic_mf_func = [this,
int i, int j, int z) { &gpu_task,
thread_num,
&accessor_wrapper_ptr](int i, int j, int z) {
// this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_); // this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_);
int mf_dim = this->index_dim_vec_[j]; int mf_dim = this->index_dim_vec_[j];
VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim; VLOG(0) << "building table: " << i << "with mf dim: " << mf_dim;
// size_t feature_value_size =
// TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float)));
auto& device_dim_keys = gpu_task->device_dim_keys_[i][j]; auto& device_dim_keys = gpu_task->device_dim_keys_[i][j];
auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j]; auto& device_dim_ptrs = gpu_task->device_dim_ptr_[i][j];
size_t len = device_dim_keys.size(); size_t len = device_dim_keys.size();
...@@ -743,10 +726,10 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -743,10 +726,10 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
// ============ add for multi-thread ================ // ============ add for multi-thread ================
for (size_t k = left; k < right; k++) { for (size_t k = left; k < right; k++) {
FeatureValue* val = (FeatureValue*)(mem_pool->mem_address(k)); #ifdef PADDLE_WITH_PSLIB
float* val = (float*)(mem_pool->mem_address(k));
float* ptr_val = device_dim_ptrs[k]->data(); float* ptr_val = device_dim_ptrs[k]->data();
size_t dim = device_dim_ptrs[k]->size(); size_t dim = device_dim_ptrs[k]->size();
#ifdef PADDLE_WITH_PSLIB
val->delta_score = val->delta_score =
ptr_val[paddle::ps::DownpourCtrDymfAccessor:: ptr_val[paddle::ps::DownpourCtrDymfAccessor::
DownpourCtrDymfFeatureValue::delta_score_index()]; DownpourCtrDymfFeatureValue::delta_score_index()];
...@@ -765,23 +748,6 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -765,23 +748,6 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: ptr_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
mf_dim_index()] = float(mf_dim); mf_dim_index()] = float(mf_dim);
val->mf_dim = mf_dim; val->mf_dim = mf_dim;
#endif
#ifdef PADDLE_WITH_PSCORE
paddle::distributed::CtrDymfAccessor accessor;
val->delta_score =
ptr_val[accessor.common_feature_value.DeltaScoreIndex()];
val->show = ptr_val[accessor.common_feature_value.ShowIndex()];
val->clk = ptr_val[accessor.common_feature_value.ClickIndex()];
val->slot = int(ptr_val[accessor.common_feature_value.SlotIndex()]);
val->lr = ptr_val[accessor.common_feature_value.EmbedWIndex()];
val->lr_g2sum = ptr_val[accessor.common_feature_value.EmbedG2SumIndex()];
val->cpu_ptr = (uint64_t)(device_dim_ptrs[k]);
// TODO(xuefeng) set mf_dim while using DownpourCtrDymfAccessor
ptr_val[accessor.common_feature_value.MfDimIndex()] = float(mf_dim);
val->mf_dim = mf_dim;
#endif
if (dim > 8) { // CpuPS already expanded as mf_dim if (dim > 8) { // CpuPS already expanded as mf_dim
val->mf_size = mf_dim + 1; val->mf_size = mf_dim + 1;
for (int x = 0; x < val->mf_dim + 1; x++) { for (int x = 0; x < val->mf_dim + 1; x++) {
...@@ -793,6 +759,12 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) { ...@@ -793,6 +759,12 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
val->mf[x] = 0; val->mf[x] = 0;
} }
} }
#endif
#ifdef PADDLE_WITH_PSCORE
void* val = mem_pool->mem_address(k);
accessor_wrapper_ptr->BuildFill(
val, device_dim_ptrs[k], cpu_table_accessor_, mf_dim);
#endif
} }
}; };
...@@ -945,7 +917,10 @@ void PSGPUWrapper::EndPass() { ...@@ -945,7 +917,10 @@ void PSGPUWrapper::EndPass() {
} }
} }
int thread_num = 8; int thread_num = 8;
auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) { auto accessor_wrapper_ptr =
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
auto dump_pool_to_cpu_func = [this, thread_num, &accessor_wrapper_ptr](
int i, int j, int z) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i)));
auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j];
auto& device_keys = this->current_task_->device_dim_keys_[i][j]; auto& device_keys = this->current_task_->device_dim_keys_[i][j];
...@@ -965,9 +940,11 @@ void PSGPUWrapper::EndPass() { ...@@ -965,9 +940,11 @@ void PSGPUWrapper::EndPass() {
} }
// ============ multi-thread process feasign============ // ============ multi-thread process feasign============
int mf_dim = this->index_dim_vec_[j]; int mf_dim = this->index_dim_vec_[j];
VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim;
size_t feature_value_size = size_t feature_value_size =
TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); accessor_wrapper_ptr->GetFeatureValueSize(mf_dim);
VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim
<< " key_len :" << len
<< " feature_value_size:" << feature_value_size;
char* test_build_values = (char*)malloc(feature_value_size * real_len); char* test_build_values = (char*)malloc(feature_value_size * real_len);
uint64_t offset = left * feature_value_size; uint64_t offset = left * feature_value_size;
cudaMemcpy(test_build_values, cudaMemcpy(test_build_values,
...@@ -981,7 +958,7 @@ void PSGPUWrapper::EndPass() { ...@@ -981,7 +958,7 @@ void PSGPUWrapper::EndPass() {
continue; continue;
} }
size_t local_offset = (i - left) * feature_value_size; size_t local_offset = (i - left) * feature_value_size;
FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset); float* gpu_val = (float*)(test_build_values + local_offset);
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
auto* downpour_value = auto* downpour_value =
(paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr);
...@@ -1002,31 +979,15 @@ void PSGPUWrapper::EndPass() { ...@@ -1002,31 +979,15 @@ void PSGPUWrapper::EndPass() {
embed_g2sum_index()] = gpu_val->lr_g2sum; embed_g2sum_index()] = gpu_val->lr_g2sum;
cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue:: cpu_val[paddle::ps::DownpourCtrDymfAccessor::DownpourCtrDymfFeatureValue::
slot_index()] = gpu_val->slot; slot_index()] = gpu_val->slot;
#endif
#ifdef PADDLE_WITH_PSCORE
auto* downpour_value =
(paddle::distributed::FixedFeatureValue*)(gpu_val->cpu_ptr);
int downpour_value_size = downpour_value->size();
if (gpu_val->mf_size > 0 && downpour_value_size == 8) {
downpour_value->resize(gpu_val->mf_dim + 1 + downpour_value_size);
}
float* cpu_val = downpour_value->data();
paddle::distributed::CtrDymfAccessor accessor;
cpu_val[accessor.common_feature_value.DeltaScoreIndex()] =
gpu_val->delta_score;
cpu_val[accessor.common_feature_value.ShowIndex()] = gpu_val->show;
cpu_val[accessor.common_feature_value.ClickIndex()] = gpu_val->clk;
cpu_val[accessor.common_feature_value.EmbedWIndex()] = gpu_val->lr;
cpu_val[accessor.common_feature_value.EmbedG2SumIndex()] =
gpu_val->lr_g2sum;
cpu_val[accessor.common_feature_value.SlotIndex()] = gpu_val->slot;
#endif
if (gpu_val->mf_size > 0) { if (gpu_val->mf_size > 0) {
for (int x = 0; x < gpu_val->mf_dim + 1; x++) { for (int x = 0; x < gpu_val->mf_dim + 1; x++) {
cpu_val[x + 8] = gpu_val->mf[x]; cpu_val[x + 8] = gpu_val->mf[x];
} }
} }
#endif
#ifdef PADDLE_WITH_PSCORE
accessor_wrapper_ptr->DumpFill(gpu_val, cpu_table_accessor_, mf_dim);
#endif
} }
free(test_build_values); free(test_build_values);
}; };
...@@ -1066,79 +1027,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -1066,79 +1027,8 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
const std::vector<float*>& values, const std::vector<float*>& values,
const std::vector<int64_t>& slot_lengths, const std::vector<int64_t>& slot_lengths,
const int hidden_size) { const int hidden_size) {
platform::Timer all_timer; VLOG(0) << "Warning:: recommend using pull_gpups_sparse op instead. This "
platform::Timer pull_gpups_timer; "PullSparse is not used.";
all_timer.Start();
int64_t total_length =
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
VLOG(3) << "Begine Gpu/Xpu Ps PullSparse";
auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue));
FeatureValue* total_values_gpu = reinterpret_cast<FeatureValue*>(buf->ptr());
if (platform::is_cpu_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GpuPs now."));
} else if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
int device_id = place.GetDeviceId();
int devid_2_index = HeterPs_->get_index_by_devid(device_id);
LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
uint64_t* total_keys = reinterpret_cast<uint64_t*>(
total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
// construct slot_level lod info
auto slot_lengths_lod = slot_lengths;
for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*));
auto buf_length =
memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
cudaMemcpy(gpu_keys,
keys.data(),
keys.size() * sizeof(uint64_t*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
this->CopyKeys(place,
gpu_keys,
total_keys,
gpu_len,
static_cast<int>(slot_lengths.size()),
static_cast<int>(total_length));
VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index
<< " len: " << total_length;
pull_gpups_timer.Start();
HeterPs_->pull_sparse(devid_2_index,
total_keys,
total_values_gpu,
static_cast<int>(total_length));
pull_gpups_timer.Pause();
VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
<< "]";
this->CopyForPull(place,
gpu_keys,
values,
total_values_gpu,
gpu_len,
static_cast<int>(slot_lengths.size()),
hidden_size,
total_length);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"GpuPs: PullSparse Only Support CUDAPlace Now."));
}
all_timer.Pause();
VLOG(3) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec()
<< " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec()
<< " s";
VLOG(3) << "End PullSparse";
} }
void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
...@@ -1156,13 +1046,16 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -1156,13 +1046,16 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
size_t feature_value_size = 0; size_t feature_value_size = 0;
feature_value_size = TYPEALIGN( auto accessor_wrapper_ptr =
8, sizeof(FeatureValue) + sizeof(float) * (index_dim_vec_.back() + 1)); GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
feature_value_size = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
VLOG(3) << "PullSparse max_dim:" << max_mf_dim_
<< " feature_value_size:" << feature_value_size;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
VLOG(3) << "Begine Gpu Ps PullSparse"; VLOG(3) << "Begine Gpu Ps PullSparse";
auto buf = memory::Alloc(place, total_length * feature_value_size); auto buf = memory::Alloc(place, total_length * feature_value_size);
FeatureValue* total_values_gpu = reinterpret_cast<FeatureValue*>(buf->ptr()); float* total_values_gpu = reinterpret_cast<float*>(buf->ptr());
#endif #endif
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
VLOG(3) << "Begine Xpu Ps PullSparse"; VLOG(3) << "Begine Xpu Ps PullSparse";
...@@ -1224,19 +1117,19 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -1224,19 +1117,19 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
<< "]"; << "]";
this->CopyForPull(place, accessor_wrapper_ptr->CopyForPull(place,
gpu_keys, gpu_keys,
values, values,
total_values_gpu, total_values_gpu,
gpu_len, gpu_len,
static_cast<int>(slot_lengths.size()), static_cast<int>(slot_lengths.size()),
hidden_size, hidden_size,
total_length, total_length,
gpu_dim); gpu_dim,
val_type_size_);
pull_gpups_timer.Pause(); pull_gpups_timer.Pause();
#endif
} else if (platform::is_xpu_place(place)) { } else if (platform::is_xpu_place(place)) {
#ifdef PADDLE_WITH_XPU_KP #ifdef PADDLE_WITH_XPU_KP
VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
...@@ -1283,14 +1176,15 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, ...@@ -1283,14 +1176,15 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
<< "]"; << "]";
this->CopyForPull(place, accessor_wrapper_ptr->CopyForPull(place,
xpu_keys, xpu_keys,
values, values,
total_values_gpu, total_values_gpu,
xpu_len, xpu_len,
static_cast<int>(slot_lengths.size()), static_cast<int>(slot_lengths.size()),
hidden_size, hidden_size,
total_length); total_length,
val_type_size_);
#endif #endif
} else { } else {
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
...@@ -1317,12 +1211,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -1317,12 +1211,13 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
// #ifdef PADDLE_WITH_CUDA // #ifdef PADDLE_WITH_CUDA
VLOG(3) << "Begin GPUPS PushSparseGrad"; VLOG(3) << "Begin GPUPS PushSparseGrad";
size_t grad_value_size = auto accessor_wrapper_ptr =
TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
size_t grad_value_size = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
auto buf = memory::Alloc(place, total_length * grad_value_size); auto buf = memory::Alloc(place, total_length * grad_value_size);
VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_; VLOG(3) << "Push Sparse Max mf dimention: " << max_mf_dim_
FeaturePushValue* total_grad_values_gpu = << "grad_value_size:" << grad_value_size;
reinterpret_cast<FeaturePushValue*>(buf->ptr()); float* total_grad_values_gpu = reinterpret_cast<float*>(buf->ptr());
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Warning:: CPUPlace is not supported in GPUPS now.")); "Warning:: CPUPlace is not supported in GPUPS now."));
...@@ -1334,23 +1229,15 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -1334,23 +1229,15 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
uint64_t* total_keys = uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>()); reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
VLOG(3) << "Begin copy grad tensor to gpups struct"; VLOG(3) << "Begin copy grad tensor to gpups struct";
if (!multi_mf_dim_) { accessor_wrapper_ptr->CopyForPush(place,
this->CopyForPush(place, grad_values,
grad_values, total_grad_values_gpu,
total_grad_values_gpu, slot_lengths,
slot_lengths, total_length,
hidden_size, batch_size,
total_length, grad_value_size,
batch_size); slot_vector_,
} else { slot_mf_dim_vector_);
this->CopyForPush(place,
grad_values,
total_grad_values_gpu,
slot_lengths,
total_length,
batch_size,
grad_value_size);
}
VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index
<< " len: " << total_length; << " len: " << total_length;
...@@ -1369,13 +1256,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, ...@@ -1369,13 +1256,14 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
uint64_t* total_keys = uint64_t* total_keys =
reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>()); reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
VLOG(3) << "Begin copy grad tensor to xpups struct"; VLOG(3) << "Begin copy grad tensor to xpups struct";
this->CopyForPush(place, accessor_wrapper_ptr->CopyForPush(place,
grad_values, grad_values,
total_grad_values_gpu, total_grad_values_gpu,
slot_lengths, slot_lengths,
hidden_size, hidden_size,
total_length, total_length,
batch_size); batch_size,
slot_vector_);
VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index VLOG(3) << "Begin call PushSparseXPU in XPUPS, dev: " << devid_2_index
<< " len: " << total_length; << " len: " << total_length;
......
...@@ -26,90 +26,6 @@ limitations under the License. */ ...@@ -26,90 +26,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
__global__ void PullCopy(float** dest,
const FeatureValue* src,
const int64_t* len,
int hidden,
int slot_num,
int total_len,
uint64_t** keys) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[x - 1] : 0);
if (*(keys[x] + y) == 0) {
*(dest[x] + y * hidden) = 0;
*(dest[x] + y * hidden + 1) = 0;
*(dest[x] + y * hidden + 2) = 0;
} else {
*(dest[x] + y * hidden) = (src + i)->show;
*(dest[x] + y * hidden + 1) = (src + i)->clk;
*(dest[x] + y * hidden + 2) = (src + i)->lr;
}
if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) {
for (int j = 0; j < hidden - 3; j++) {
*(dest[x] + y * hidden + 3 + j) = 0;
}
} else {
for (int j = 0; j < hidden - 3; j++) {
*(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j];
}
}
}
}
__global__ void PullCopy(float** dest,
const FeatureValue* src,
const int64_t* len,
int slot_num,
int total_len,
uint64_t** keys,
uint64_t max_val_size,
int* gpu_dim) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[x - 1] : 0);
FeatureValue* feature_value_ptr =
(FeatureValue*)((char*)src + uint64_t(i) * uint64_t(max_val_size));
int mf_dim = gpu_dim[x] - 3;
if (*(keys[x] + y) == 0) {
*(dest[x] + y * (mf_dim + 3)) = 0;
*(dest[x] + y * (mf_dim + 3) + 1) = 0;
*(dest[x] + y * (mf_dim + 3) + 2) = 0;
} else {
*(dest[x] + y * (mf_dim + 3)) = feature_value_ptr->show;
*(dest[x] + y * (mf_dim + 3) + 1) = feature_value_ptr->clk;
*(dest[x] + y * (mf_dim + 3) + 2) = feature_value_ptr->lr;
}
if ((feature_value_ptr)->mf_size == 0 || *(keys[x] + y) == 0) {
for (int j = 0; j < mf_dim; j++) {
*(dest[x] + y * (mf_dim + 3) + 3 + j) = 0;
}
} else {
for (int j = 0; j < mf_dim; j++) {
*(dest[x] + y * (mf_dim + 3) + 3 + j) = feature_value_ptr->mf[1 + j];
}
}
}
}
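Editor's note: both removed `PullCopy` variants (and `PushCopyWithPool` below) share the same indexing idiom: `len` holds cumulative slot lengths, and a binary search maps a flat key index `i` to its slot `x` and in-slot offset `y`. A host-side sketch of that mapping, with a small hypothetical cumulative-length array, is shown for reference.

```cpp
#include <cstdio>

// Sketch: map a flat index i to (slot, offset) given cumulative slot lengths,
// the same lower-bound style search used in the removed CUDA kernels.
void MapIndex(const long long* len, int slot_num, int i,
              int* slot, int* offset) {
  int low = 0, high = slot_num - 1;
  while (low < high) {
    int mid = (low + high) / 2;
    if (i < len[mid]) {
      high = mid;
    } else {
      low = mid + 1;
    }
  }
  *slot = low;
  *offset = i - (low ? len[low - 1] : 0);
}

int main() {
  // Hypothetical cumulative lengths for 3 slots of sizes 3, 2 and 4.
  const long long len[] = {3, 5, 9};
  for (int i = 0; i < 9; ++i) {
    int x, y;
    MapIndex(len, 3, i, &x, &y);
    std::printf("i=%d -> slot=%d offset=%d\n", i, x, y);
  }
  return 0;
}
```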
__global__ void CopyKeysKernel(uint64_t** src_keys, __global__ void CopyKeysKernel(uint64_t** src_keys,
uint64_t* dest_total_keys, uint64_t* dest_total_keys,
const int64_t* len, const int64_t* len,
...@@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest, ...@@ -161,101 +77,8 @@ __global__ void PushCopy(FeaturePushValue* dest,
} }
} }
__global__ void PushCopyWithPool(FeaturePushValue* dest,
float** src,
int64_t* len,
int slot_num,
uint64_t total_len,
int bs,
int* slot_vector,
int* mf_dim_vector,
size_t grad_value_size) {
CUDA_KERNEL_LOOP(i, total_len) {
int low = 0;
int high = slot_num - 1;
while (low < high) {
int mid = (low + high) / 2;
if (i < len[mid])
high = mid;
else
low = mid + 1;
}
int x = low;
int y = i - (x ? len[low - 1] : 0);
FeaturePushValue* cur =
(FeaturePushValue*)((char*)dest + i * grad_value_size);
cur->slot = slot_vector[x];
int mf_dim = mf_dim_vector[x];
cur->mf_dim = mf_dim;
cur->show = *(src[x] + y * (mf_dim + 3));
cur->clk = *(src[x] + y * (mf_dim + 3) + 1);
cur->lr_g = *(src[x] + y * (mf_dim + 3) + 2) * -1. * bs;
for (int j = 0; j < cur->mf_dim; j++) {
cur->mf_g[j] = *(src[x] + y * (mf_dim + 3) + 3 + j) * -1. * bs;
}
}
}
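Editor's note: `PushCopyWithPool` (removed here in favour of the accessor wrapper's `CopyForPush`) reads each key's gradients in the dense layout `[show, clk, lr, mf_0 .. mf_{mf_dim-1}]` and writes them into a byte-strided push record, negating and scaling by the batch size on the way. The sketch below reproduces that conversion for one record; the layout is the one visible in the removed kernel, everything else is illustrative.

```cpp
#include <cstdio>
#include <vector>

// Sketch: convert one key's dense gradient slice into push fields, applying
// the "* -1. * bs" scaling from the removed kernel.
struct PushRecord {
  float show, clk, lr_g;
  std::vector<float> mf_g;
};

PushRecord ToPush(const float* src, int mf_dim, int bs) {
  PushRecord out;
  out.show = src[0];
  out.clk = src[1];
  out.lr_g = src[2] * -1.f * bs;
  out.mf_g.resize(mf_dim);
  for (int j = 0; j < mf_dim; ++j) {
    out.mf_g[j] = src[3 + j] * -1.f * bs;
  }
  return out;
}

int main() {
  const float grad[] = {1.f, 0.f, 0.25f, 0.1f, -0.2f};  // mf_dim = 2
  PushRecord r = ToPush(grad, 2, 4);
  std::printf("lr_g=%f mf_g0=%f\n", r.lr_g, r.mf_g[0]);  // -1.0, -0.4
  return 0;
}
```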
PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; }
void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
cudaMemcpy(gpu_values,
values.data(),
values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
gpu_values,
total_values_gpu,
gpu_len,
hidden_size,
slot_num,
total_length,
gpu_keys);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto buf_value = memory::Alloc(place, values.size() * sizeof(float*));
float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
cudaMemcpy(gpu_values,
values.data(),
values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
PullCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
gpu_values,
total_values_gpu,
gpu_len,
slot_num,
total_length,
gpu_keys,
val_type_size_,
gpu_dim);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys, uint64_t** origin_keys,
uint64_t* total_keys, uint64_t* total_keys,
...@@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, ...@@ -270,125 +93,26 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
cudaStreamSynchronize(stream); cudaStreamSynchronize(stream);
} }
void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const int hidden_size,
const int64_t total_length,
const int batch_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
auto buf_grad_value =
memory::Alloc(place, grad_values.size() * sizeof(float*));
auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
auto buf_slot_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
cudaMemcpy(gpu_values,
grad_values.data(),
grad_values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
cudaMemcpy(d_slot_vector,
slot_vector_.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
PushCopy<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
total_grad_values_gpu,
gpu_values,
gpu_len,
hidden_size,
slot_lengths.size(),
total_length,
batch_size,
d_slot_vector);
cudaStreamSynchronize(stream);
}
void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size) {
auto stream = dynamic_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
auto slot_lengths_lod = slot_lengths;
for (int i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
auto buf_grad_value =
memory::Alloc(place, grad_values.size() * sizeof(float*));
auto buf_length = memory::Alloc(place, slot_lengths.size() * sizeof(int64_t));
auto buf_slot_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
auto buf_mf_dim_vector =
memory::Alloc(place, slot_lengths_lod.size() * sizeof(int));
float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
int* d_mf_dim_vector = reinterpret_cast<int*>(buf_mf_dim_vector->ptr());
cudaMemcpy(gpu_values,
grad_values.data(),
grad_values.size() * sizeof(float*),
cudaMemcpyHostToDevice);
cudaMemcpy(gpu_len,
slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t),
cudaMemcpyHostToDevice);
cudaMemcpy(d_slot_vector,
slot_vector_.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
cudaMemcpy(d_mf_dim_vector,
slot_mf_dim_vector_.data(),
slot_lengths_lod.size() * sizeof(int),
cudaMemcpyHostToDevice);
PushCopyWithPool<<<(total_length + 1024 - 1) / 1024, 1024, 0, stream>>>(
total_grad_values_gpu,
gpu_values,
gpu_len,
slot_lengths.size(),
total_length,
batch_size,
d_slot_vector,
d_mf_dim_vector,
grad_value_size);
cudaStreamSynchronize(stream);
}
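Editor's note: both removed `CopyForPush` overloads first turn `slot_lengths` into a running (inclusive prefix) sum before copying it to the device, which is the cumulative array the index search sketched above consumes. A host-side sketch with example lengths:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Sketch: the slot_lengths_lod prefix sum built by the removed CopyForPush.
  std::vector<long long> slot_lengths = {3, 2, 4};  // hypothetical per-slot counts
  std::vector<long long> slot_lengths_lod = slot_lengths;
  for (size_t i = 1; i < slot_lengths_lod.size(); ++i) {
    slot_lengths_lod[i] += slot_lengths_lod[i - 1];
  }
  for (long long v : slot_lengths_lod) {
    std::printf("%lld ", v);  // prints: 3 5 9
  }
  std::printf("\n");
  return 0;
}
```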
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
float clk_coeff, float clk_coeff,
float min_bound, float min_bound,
float max_bound, float max_bound,
float learning_rate, float learning_rate,
float initial_g2sum, float initial_g2sum,
float initial_range) { float initial_range,
OptimizerConfig optimizer_config; float beta1_decay_rate,
optimizer_config.set_sparse_sgd(nonclk_coeff, float beta2_decay_rate,
clk_coeff, float ada_epsilon) {
min_bound, optimizer_config_.set_sparse_sgd(nonclk_coeff,
max_bound, clk_coeff,
learning_rate, min_bound,
initial_g2sum, max_bound,
initial_range); learning_rate,
HeterPs_->set_sparse_sgd(optimizer_config); initial_g2sum,
initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
} }
void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
...@@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, ...@@ -396,15 +120,19 @@ void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_initial_g2sum, float mf_initial_g2sum,
float mf_initial_range, float mf_initial_range,
float mf_min_bound, float mf_min_bound,
float mf_max_bound) { float mf_max_bound,
OptimizerConfig optimizer_config; float mf_beta1_decay_rate,
optimizer_config.set_embedx_sgd(mf_create_thresholds, float mf_beta2_decay_rate,
mf_learning_rate, float mf_ada_epsilon) {
mf_initial_g2sum, optimizer_config_.set_embedx_sgd(mf_create_thresholds,
mf_initial_range, mf_learning_rate,
mf_min_bound, mf_initial_g2sum,
mf_max_bound); mf_initial_range,
HeterPs_->set_embedx_sgd(optimizer_config); mf_min_bound,
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
} }
} // end namespace framework } // end namespace framework
......
...@@ -51,7 +51,10 @@ limitations under the License. */ ...@@ -51,7 +51,10 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_PSCORE #ifdef PADDLE_WITH_PSCORE
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/ctr_dymf_accessor.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h" #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/distributed/the_one_ps.pb.h"
#endif #endif
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
#include "afs_api.h" #include "afs_api.h"
...@@ -64,9 +67,6 @@ limitations under the License. */ ...@@ -64,9 +67,6 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
#define TYPEALIGN(ALIGNVAL, LEN) \
(((uint64_t)(LEN) + ((ALIGNVAL)-1)) & ~((uint64_t)((ALIGNVAL)-1)))
class Dataset; class Dataset;
#ifdef PADDLE_WITH_PSLIB #ifdef PADDLE_WITH_PSLIB
...@@ -98,7 +98,7 @@ class AfsWrapper { ...@@ -98,7 +98,7 @@ class AfsWrapper {
class PSGPUWrapper { class PSGPUWrapper {
public: public:
virtual ~PSGPUWrapper(); ~PSGPUWrapper();
PSGPUWrapper() { PSGPUWrapper() {
HeterPs_ = NULL; HeterPs_ = NULL;
...@@ -139,37 +139,6 @@ class PSGPUWrapper { ...@@ -139,37 +139,6 @@ class PSGPUWrapper {
const int64_t* gpu_len, const int64_t* gpu_len,
int slot_num, int slot_num,
int total_len); int total_len);
void CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length);
void CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len,
const int slot_num,
const int hidden_size,
const int64_t total_length,
int* gpu_dim);
void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const int hidden_size,
const int64_t total_length,
const int batch_size);
void CopyForPush(const paddle::platform::Place& place,
const std::vector<const float*>& grad_values,
FeaturePushValue* total_grad_values_gpu,
const std::vector<int64_t>& slot_lengths,
const uint64_t total_length,
const int batch_size,
size_t grad_value_size);
void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task); void BuildGPUTask(std::shared_ptr<HeterContext> gpu_task);
void PreBuildTask(std::shared_ptr<HeterContext> gpu_task); void PreBuildTask(std::shared_ptr<HeterContext> gpu_task);
...@@ -274,13 +243,96 @@ class PSGPUWrapper { ...@@ -274,13 +243,96 @@ class PSGPUWrapper {
float max_bound, float max_bound,
float learning_rate, float learning_rate,
float initial_g2sum, float initial_g2sum,
float initial_range); float initial_range,
float beta1_decay_rate,
float beta2_decay_rate,
float ada_epsilon);
void SetEmbedxSGD(float mf_create_thresholds, void SetEmbedxSGD(float mf_create_thresholds,
float mf_learning_rate, float mf_learning_rate,
float mf_initial_g2sum, float mf_initial_g2sum,
float mf_initial_range, float mf_initial_range,
float mf_min_bound, float mf_min_bound,
float mf_max_bound); float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon);
#ifdef PADDLE_WITH_PSCORE
void add_sparse_optimizer(
std::unordered_map<std::string, float>& config, // NOLINT
const ::paddle::distributed::SparseCommonSGDRuleParameter& sgd_param,
const std::string& prefix = "") {
auto optimizer_name = sgd_param.name();
if (optimizer_name == "SparseNaiveSGDRule") {
config[prefix + "optimizer_type"] = 0;
config[prefix + "learning_rate"] = sgd_param.naive().learning_rate();
config[prefix + "initial_range"] = sgd_param.naive().initial_range();
config[prefix + "min_bound"] = sgd_param.naive().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.naive().weight_bounds()[1];
} else if (optimizer_name == "SparseAdaGradSGDRule") {
config[prefix + "optimizer_type"] = 1;
config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
} else if (optimizer_name == "StdAdaGradSGDRule") {
config[prefix + "optimizer_type"] = 2;
config[prefix + "learning_rate"] = sgd_param.adagrad().learning_rate();
config[prefix + "initial_range"] = sgd_param.adagrad().initial_range();
config[prefix + "initial_g2sum"] = sgd_param.adagrad().initial_g2sum();
config[prefix + "min_bound"] = sgd_param.adagrad().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adagrad().weight_bounds()[1];
} else if (optimizer_name == "SparseAdamSGDRule") {
config[prefix + "optimizer_type"] = 3;
config[prefix + "learning_rate"] = sgd_param.adam().learning_rate();
config[prefix + "initial_range"] = sgd_param.adam().initial_range();
config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate();
config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate();
config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon();
config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1];
} else if (optimizer_name == "SparseSharedAdamSGDRule") {
config[prefix + "optimizer_type"] = 4;
config[prefix + "learning_rate"] = sgd_param.adam().learning_rate();
config[prefix + "initial_range"] = sgd_param.adam().initial_range();
config[prefix + "beta1_decay_rate"] = sgd_param.adam().beta1_decay_rate();
config[prefix + "beta2_decay_rate"] = sgd_param.adam().beta2_decay_rate();
config[prefix + "ada_epsilon"] = sgd_param.adam().ada_epsilon();
config[prefix + "min_bound"] = sgd_param.adam().weight_bounds()[0];
config[prefix + "max_bound"] = sgd_param.adam().weight_bounds()[1];
}
}
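Editor's note: `add_sparse_optimizer` flattens one `SparseCommonSGDRuleParameter` into an `optimizer_type` entry plus that rule's hyperparameters, with the `prefix` argument ("" for embed_w, "mf_" for embedx) keeping the two rule sets apart in a single map. Below is a hedged sketch of what the map could contain after registering a `SparseAdamSGDRule` under the `"mf_"` prefix; the numeric values are only the defaults used elsewhere in this change, not values read from any real table config.

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>

int main() {
  // Illustrative only: expected entries after
  // add_sparse_optimizer(config, embedx_sgd_param, "mf_") when
  // embedx_sgd_param.name() == "SparseAdamSGDRule".
  std::unordered_map<std::string, float> config = {
      {"mf_optimizer_type", 3},  // 0 naive, 1 adagrad, 2 std_adagrad, 3 adam, 4 shared_adam
      {"mf_learning_rate", 0.001f},
      {"mf_initial_range", 1e-4f},
      {"mf_beta1_decay_rate", 0.9f},
      {"mf_beta2_decay_rate", 0.999f},
      {"mf_ada_epsilon", 1e-8f},
      {"mf_min_bound", -10.0f},
      {"mf_max_bound", 10.0f},
  };
  for (const auto& kv : config) {
    std::printf("%s = %g\n", kv.first.c_str(), kv.second);
  }
  return 0;
}
```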
void InitializeGPUServer(paddle::distributed::PSParameter ps_param) {
auto sparse_table =
ps_param.server_param().downpour_server_param().downpour_table_param(0);
auto sparse_table_accessor = sparse_table.accessor();
auto sparse_table_accessor_parameter =
sparse_table_accessor.ctr_accessor_param();
accessor_class_ = sparse_table_accessor.accessor_class();
std::unordered_map<std::string, float> config;
config["embedx_dim"] = sparse_table_accessor.embedx_dim();
config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();
if (accessor_class_ == "CtrDymfAccessor") {
// optimizer config for embed_w and embedx
add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
add_sparse_optimizer(
config, sparse_table_accessor.embedx_sgd_param(), "mf_");
}
fleet_config_ = config;
GlobalAccessorTransfor::GetInstance().Init(accessor_class_);
GlobalAccessorTransfor::GetInstance().GetAccessorWrapper()->Configure(
config);
InitializeGPUServer(config);
}
#endif
void InitializeGPUServer(std::unordered_map<std::string, float> config) { void InitializeGPUServer(std::unordered_map<std::string, float> config) {
float nonclk_coeff = (config.find("nonclk_coeff") == config.end()) float nonclk_coeff = (config.find("nonclk_coeff") == config.end())
? 1.0 ? 1.0
...@@ -288,54 +340,83 @@ class PSGPUWrapper { ...@@ -288,54 +340,83 @@ class PSGPUWrapper {
float clk_coeff = float clk_coeff =
(config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"]; (config.find("clk_coeff") == config.end()) ? 1.0 : config["clk_coeff"];
float min_bound = (config.find("min_bound") == config.end()) float min_bound = (config.find("min_bound") == config.end())
? -10000.0 ? -10.0
: config["min_bound"]; : config["min_bound"];
float max_bound = (config.find("max_bound") == config.end()) float max_bound =
? 10000.0 (config.find("max_bound") == config.end()) ? 10.0 : config["max_bound"];
: config["max_bound"];
float learning_rate = (config.find("learning_rate") == config.end()) float learning_rate = (config.find("learning_rate") == config.end())
? 1.0 ? 0.05
: config["learning_rate"]; : config["learning_rate"];
float initial_g2sum = (config.find("initial_g2sum") == config.end()) float initial_g2sum = (config.find("initial_g2sum") == config.end())
? 1.0 ? 3.0
: config["initial_g2sum"]; : config["initial_g2sum"];
float initial_range = (config.find("initial_range") == config.end()) float initial_range = (config.find("initial_range") == config.end())
? 1.0 ? 1e-4
: config["initial_range"]; : config["initial_range"];
float beta1_decay_rate = (config.find("beta1_decay_rate") == config.end())
? 0.9
: config["beta1_decay_rate"];
float beta2_decay_rate = (config.find("beta2_decay_rate") == config.end())
? 0.999
: config["beta2_decay_rate"];
float ada_epsilon = (config.find("ada_epsilon") == config.end())
? 1e-8
: config["ada_epsilon"];
// mf config settings // mf config settings
float mf_create_thresholds = float mf_create_thresholds =
(config.find("mf_create_thresholds") == config.end()) (config.find("mf_create_thresholds") == config.end())
? static_cast<float>(1.0) ? static_cast<float>(1.0)
: config["mf_create_thresholds"]; : config["mf_create_thresholds"];
float mf_learning_rate = (config.find("mf_learning_rate") == config.end()) float mf_learning_rate = (config.find("mf_learning_rate") == config.end())
? 1.0 ? 0.05
: config["mf_learning_rate"]; : config["mf_learning_rate"];
float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end()) float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end())
? 1.0 ? 3.0
: config["mf_initial_g2sum"]; : config["mf_initial_g2sum"];
float mf_initial_range = (config.find("mf_initial_range") == config.end()) float mf_initial_range = (config.find("mf_initial_range") == config.end())
? 1.0 ? 1e-4
: config["mf_initial_range"]; : config["mf_initial_range"];
float mf_min_bound = (config.find("mf_min_bound") == config.end()) float mf_min_bound = (config.find("mf_min_bound") == config.end())
? 1.0 ? -10.0
: config["mf_min_bound"]; : config["mf_min_bound"];
float mf_max_bound = (config.find("mf_max_bound") == config.end()) float mf_max_bound = (config.find("mf_max_bound") == config.end())
? 1.0 ? 10.0
: config["mf_max_bound"]; : config["mf_max_bound"];
float mf_beta1_decay_rate =
(config.find("mf_beta1_decay_rate") == config.end())
? 0.9
: config["mf_beta1_decay_rate"];
float mf_beta2_decay_rate =
(config.find("mf_beta2_decay_rate") == config.end())
? 0.999
: config["mf_beta2_decay_rate"];
float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end())
? 1e-8
: config["mf_ada_epsilon"];
this->SetSparseSGD(nonclk_coeff, this->SetSparseSGD(nonclk_coeff,
clk_coeff, clk_coeff,
min_bound, min_bound,
max_bound, max_bound,
learning_rate, learning_rate,
initial_g2sum, initial_g2sum,
initial_range); initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
this->SetEmbedxSGD(mf_create_thresholds, this->SetEmbedxSGD(mf_create_thresholds,
mf_learning_rate, mf_learning_rate,
mf_initial_g2sum, mf_initial_g2sum,
mf_initial_range, mf_initial_range,
mf_min_bound, mf_min_bound,
mf_max_bound); mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
// set optimizer type(naive,adagrad,std_adagrad,adam,share_adam)
optimizer_type_ = (config.find("optimizer_type") == config.end())
? 1
: static_cast<int>(config["optimizer_type"]);
} }
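Editor's note: `InitializeGPUServer(config)` resolves every hyperparameter with the same "use the configured value, otherwise a default" ternary; the updated defaults (learning_rate 0.05, initial_g2sum 3.0, initial_range 1e-4, bounds ±10, Adam beta1 0.9 / beta2 0.999 / epsilon 1e-8) replace the old placeholder values, and `optimizer_type` defaults to 1 (adagrad). The helper below expresses that lookup pattern purely as a refactoring sketch; it is not code from this commit.

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>

// Sketch: the repeated
// "(config.find(k) == config.end()) ? default : config[k]"
// lookup from InitializeGPUServer, factored into one helper.
float GetOrDefault(const std::unordered_map<std::string, float>& config,
                   const std::string& key, float default_value) {
  auto it = config.find(key);
  return it == config.end() ? default_value : it->second;
}

int main() {
  std::unordered_map<std::string, float> config = {{"learning_rate", 0.01f}};
  float learning_rate = GetOrDefault(config, "learning_rate", 0.05f);      // 0.01
  float beta1_decay_rate = GetOrDefault(config, "beta1_decay_rate", 0.9f); // default
  int optimizer_type =
      static_cast<int>(GetOrDefault(config, "optimizer_type", 1));         // 1 = adagrad
  std::printf("lr=%g beta1=%g opt=%d\n", learning_rate, beta1_decay_rate,
              optimizer_type);
  return 0;
}
```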
void SetDate(int year, int month, int day) { void SetDate(int year, int month, int day) {
...@@ -380,7 +461,7 @@ class PSGPUWrapper { ...@@ -380,7 +461,7 @@ class PSGPUWrapper {
if (slot_info_initialized_) { if (slot_info_initialized_) {
return; return;
} }
SlotRecordDataset* dataset = dynamic_cast<SlotRecordDataset*>(dataset_); SlotRecordDataset* dataset = (SlotRecordDataset*)(dataset_);
auto slots_vec = dataset->GetSlots(); auto slots_vec = dataset->GetSlots();
slot_offset_vector_.clear(); slot_offset_vector_.clear();
for (auto& slot : slot_vector_) { for (auto& slot : slot_vector_) {
...@@ -421,10 +502,13 @@ class PSGPUWrapper { ...@@ -421,10 +502,13 @@ class PSGPUWrapper {
for (size_t i = 0; i < slot_index_vec_.size(); i++) { for (size_t i = 0; i < slot_index_vec_.size(); i++) {
slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]]; slot_index_vec_[i] = dim_index_map[slot_mf_dim_vector_[i]];
} }
val_type_size_ =
TYPEALIGN(8, sizeof(FeatureValue) + sizeof(float) * (max_mf_dim_ + 1)); auto accessor_wrapper_ptr =
grad_type_size_ = GlobalAccessorTransfor::GetInstance().GetAccessorWrapper();
TYPEALIGN(8, sizeof(FeaturePushValue) + (max_mf_dim_ * sizeof(float))); val_type_size_ = accessor_wrapper_ptr->GetFeatureValueSize(max_mf_dim_);
grad_type_size_ = accessor_wrapper_ptr->GetPushValueSize(max_mf_dim_);
VLOG(0) << "InitSlotInfo: val_type_size_" << val_type_size_
<< " grad_type_size_:" << grad_type_size_;
slot_info_initialized_ = true; slot_info_initialized_ = true;
} }
#endif #endif
...@@ -445,6 +529,12 @@ class PSGPUWrapper { ...@@ -445,6 +529,12 @@ class PSGPUWrapper {
const std::string& conf); const std::string& conf);
#endif #endif
#ifdef PADDLE_WITH_PSCORE
void SetTableAccessor(paddle::distributed::ValueAccessor* accessor) {
cpu_table_accessor_ = accessor;
}
#endif
private: private:
static std::shared_ptr<PSGPUWrapper> s_instance_; static std::shared_ptr<PSGPUWrapper> s_instance_;
Dataset* dataset_; Dataset* dataset_;
...@@ -497,6 +587,12 @@ class PSGPUWrapper { ...@@ -497,6 +587,12 @@ class PSGPUWrapper {
int day_; int day_;
bool slot_info_initialized_ = false; bool slot_info_initialized_ = false;
int use_afs_api_ = 0; int use_afs_api_ = 0;
int optimizer_type_ = 1;
std::string accessor_class_;
std::unordered_map<std::string, float> fleet_config_;
#ifdef PADDLE_WITH_PSCORE
paddle::distributed::ValueAccessor* cpu_table_accessor_;
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
std::vector<MemoryPool*> mem_pools_; std::vector<MemoryPool*> mem_pools_;
...@@ -521,6 +617,7 @@ class PSGPUWrapper { ...@@ -521,6 +617,7 @@ class PSGPUWrapper {
bool running_ = false; bool running_ = false;
std::vector<std::shared_ptr<ThreadPool>> pull_thread_pool_; std::vector<std::shared_ptr<ThreadPool>> pull_thread_pool_;
std::vector<std::shared_ptr<ThreadPool>> hbm_thread_pool_; std::vector<std::shared_ptr<ThreadPool>> hbm_thread_pool_;
OptimizerConfig optimizer_config_;
protected: protected:
static bool is_initialized_; static bool is_initialized_;
......
...@@ -28,9 +28,13 @@ limitations under the License. */ ...@@ -28,9 +28,13 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
__global__ void PullCopy(float* dest, const FeatureValue* src, __global__ void PullCopy(float* dest,
const long long* len, int hidden, int slot_num, const FeatureValue* src,
int total_len, unsigned long long* keys) { const long long* len,
int hidden,
int slot_num,
int total_len,
unsigned long long* keys) {
int cid = core_id(); int cid = core_id();
int ncores = core_num(); int ncores = core_num();
if (cid >= ncores) { if (cid >= ncores) {
...@@ -42,8 +46,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, ...@@ -42,8 +46,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
GM2LM(len, local_len, slot_num * sizeof(int64_t)); GM2LM(len, local_len, slot_num * sizeof(int64_t));
__global_ptr__ unsigned long long* local_keys[slot_num]; __global_ptr__ unsigned long long* local_keys[slot_num];
GM2LM(keys, local_keys, GM2LM(
slot_num * sizeof(__global_ptr__ unsigned long long*)); keys, local_keys, slot_num * sizeof(__global_ptr__ unsigned long long*));
__global_ptr__ float* local_dest[slot_num]; __global_ptr__ float* local_dest[slot_num];
GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*)); GM2LM(dest, local_dest, slot_num * sizeof(__global_ptr__ float*));
...@@ -64,10 +68,11 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, ...@@ -64,10 +68,11 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
// copy read_len (length) of slots' val to LM // copy read_len (length) of slots' val to LM
for (int k = 0; k < slot_len; k += read_len) { for (int k = 0; k < slot_len; k += read_len) {
int real_read_len = min(read_len, slot_len - k); int real_read_len = min(read_len, slot_len - k);
GM2LM(src + dest_len + k, local_slot_vals, GM2LM(src + dest_len + k,
local_slot_vals,
real_read_len * sizeof(FeatureValue)); real_read_len * sizeof(FeatureValue));
GM2LM(local_keys[i] + k, local_slot_keys, GM2LM(
real_read_len * sizeof(uint64_t)); local_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t));
for (int j = 0; j < real_read_len; j++) { for (int j = 0; j < real_read_len; j++) {
if (local_slot_keys[j] == 0) { if (local_slot_keys[j] == 0) {
local_dest_vals[j * hidden] = 0; local_dest_vals[j * hidden] = 0;
...@@ -89,7 +94,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, ...@@ -89,7 +94,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
} }
} }
} }
LM2GM(local_dest_vals, local_dest[i] + k * hidden, LM2GM(local_dest_vals,
local_dest[i] + k * hidden,
real_read_len * hidden * sizeof(float)); real_read_len * hidden * sizeof(float));
} }
} }
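Editor's note: the XPU kernels in this file change only in formatting (arguments wrapped one per line); their logic remains the chunked copy between global and local memory, processing each slot in `read_len`-sized pieces and clamping the last piece with `min(read_len, slot_len - k)`. A plain C++ sketch of that chunking loop, with a local buffer standing in for the `GM2LM`/`LM2GM` transfers:

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Sketch: process a slot of slot_len items in chunks of at most read_len,
  // mirroring the GM2LM/LM2GM tiling in the XPU PullCopy/PushCopy kernels.
  const int slot_len = 250;
  const int read_len = 100;  // size of the local (LM) staging buffer
  std::vector<int> staging(read_len);

  for (int k = 0; k < slot_len; k += read_len) {
    int real_read_len = std::min(read_len, slot_len - k);
    // GM2LM would copy real_read_len items starting at offset k into staging,
    // the per-item work would run on the local copy, and LM2GM would write
    // the results back to global memory.
    std::printf("chunk at %d, size %d\n", k, real_read_len);
  }
  return 0;
}
```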
...@@ -97,7 +103,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src, ...@@ -97,7 +103,8 @@ __global__ void PullCopy(float* dest, const FeatureValue* src,
__global__ void CopyKeysKernel(unsigned long long* src_keys, __global__ void CopyKeysKernel(unsigned long long* src_keys,
unsigned long long* dest_total_keys, unsigned long long* dest_total_keys,
const long long* len, int slot_num, const long long* len,
int slot_num,
int total_len) { int total_len) {
int cid = core_id(); int cid = core_id();
int ncores = core_num(); int ncores = core_num();
...@@ -110,7 +117,8 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, ...@@ -110,7 +117,8 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys,
GM2LM(len, local_len, slot_num * sizeof(long long)); GM2LM(len, local_len, slot_num * sizeof(long long));
__global_ptr__ unsigned long long* local_keys[slot_num]; __global_ptr__ unsigned long long* local_keys[slot_num];
GM2LM(src_keys, local_keys, GM2LM(src_keys,
local_keys,
slot_num * sizeof(__global_ptr__ unsigned long long*)); slot_num * sizeof(__global_ptr__ unsigned long long*));
for (int i = thread_id; i < slot_num; i += nthreads) { for (int i = thread_id; i < slot_num; i += nthreads) {
...@@ -123,16 +131,23 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys, ...@@ -123,16 +131,23 @@ __global__ void CopyKeysKernel(unsigned long long* src_keys,
for (int k = 0; k < slot_len; k += read_len) { for (int k = 0; k < slot_len; k += read_len) {
int real_read_len = min(read_len, slot_len - k); int real_read_len = min(read_len, slot_len - k);
GM2LM(local_keys[i] + k, local_slot_keys, GM2LM(local_keys[i] + k,
local_slot_keys,
real_read_len * sizeof(unsigned long long)); real_read_len * sizeof(unsigned long long));
LM2GM(local_slot_keys, dest_total_keys + dest_len + k, LM2GM(local_slot_keys,
dest_total_keys + dest_len + k,
real_read_len * sizeof(unsigned long long)); real_read_len * sizeof(unsigned long long));
} }
} }
} }
__global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, __global__ void PushCopy(FeaturePushValue* dest,
int hidden, int slot_num, int total_len, int bs, float* src,
long long* len,
int hidden,
int slot_num,
int total_len,
int bs,
int* slot_vector) { int* slot_vector) {
int cid = core_id(); int cid = core_id();
int ncores = core_num(); int ncores = core_num();
...@@ -163,7 +178,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, ...@@ -163,7 +178,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len,
// copy read_len(length) of slots' grad to LM // copy read_len(length) of slots' grad to LM
for (int k = 0; k < slot_len; k += read_len) { for (int k = 0; k < slot_len; k += read_len) {
int real_read_len = min(read_len, slot_len - k); int real_read_len = min(read_len, slot_len - k);
GM2LM(local_src[i] + k * hidden, local_slot_grads, GM2LM(local_src[i] + k * hidden,
local_slot_grads,
real_read_len * hidden * sizeof(float)); real_read_len * hidden * sizeof(float));
// copy from slots' grad to total grad // copy from slots' grad to total grad
for (int j = 0; j < real_read_len; j++) { for (int j = 0; j < real_read_len; j++) {
...@@ -176,7 +192,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, ...@@ -176,7 +192,8 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len,
local_slot_grads[j * hidden + 3 + m] * -1. * bs; local_slot_grads[j * hidden + 3 + m] * -1. * bs;
} }
} }
LM2GM(local_dest_grads, dest + dest_len + k, LM2GM(local_dest_grads,
dest + dest_len + k,
real_read_len * sizeof(FeaturePushValue)); real_read_len * sizeof(FeaturePushValue));
} }
} }
...@@ -184,40 +201,11 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len, ...@@ -184,40 +201,11 @@ __global__ void PushCopy(FeaturePushValue* dest, float* src, long long* len,
PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; } PSGPUWrapper::~PSGPUWrapper() { delete HeterPs_; }
void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
uint64_t** gpu_keys,
const std::vector<float*>& values,
const FeatureValue* total_values_gpu,
const int64_t* gpu_len, const int slot_num,
const int hidden_size,
const int64_t total_length) {
XPUStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
->x_context()
->xpu_stream;
// float* buf_value = nullptr;
// xpu_malloc(reinterpret_cast<void**>(&buf_value),
// values.size() * sizeof(float*));
// float** gpu_values = reinterpret_cast<float**>(&buf_value);
float* gpu_values = nullptr;
xpu_malloc(reinterpret_cast<void**>(&gpu_values),
values.size() * sizeof(float*));
xpu_memcpy(gpu_values, values.data(), values.size() * sizeof(float*),
XPU_HOST_TO_DEVICE);
// unsigned long long** c_keys = (unsigned long long**)gpu_keys;
unsigned long long* c_keys = reinterpret_cast<unsigned long long*>(gpu_keys);
const long long* c_len = (const long long*)gpu_len;
PullCopy<<<2, 64, stream>>>(gpu_values, total_values_gpu, c_len, hidden_size,
slot_num, total_length, c_keys);
xpu_wait(stream);
}
void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
uint64_t** origin_keys, uint64_t* total_keys, uint64_t** origin_keys,
const int64_t* gpu_len, int slot_num, uint64_t* total_keys,
const int64_t* gpu_len,
int slot_num,
int total_len) { int total_len) {
XPUStream stream = nullptr; XPUStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
...@@ -232,66 +220,49 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place, ...@@ -232,66 +220,49 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
xpu_wait(stream); xpu_wait(stream);
} }
void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place, void PSGPUWrapper::SetSparseSGD(float nonclk_coeff,
const std::vector<const float*>& grad_values, float clk_coeff,
FeaturePushValue* total_grad_values_gpu, float min_bound,
const std::vector<int64_t>& slot_lengths, float max_bound,
const int hidden_size, float learning_rate,
const int64_t total_length, float initial_g2sum,
const int batch_size) { float initial_range,
XPUStream stream = nullptr; float beta1_decay_rate,
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); float beta2_decay_rate,
stream = static_cast<platform::XPUDeviceContext*>(dev_ctx) float ada_epsilon) {
->x_context()
->xpu_stream;
auto slot_lengths_lod = slot_lengths;
for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
slot_lengths_lod[i] += slot_lengths_lod[i - 1];
}
float* gpu_values = nullptr;
int64_t* gpu_len = nullptr;
int* d_slot_vector = nullptr;
xpu_malloc(reinterpret_cast<void**>(&gpu_values),
grad_values.size() * sizeof(float*));
xpu_malloc(reinterpret_cast<void**>(&gpu_len),
slot_lengths.size() * sizeof(int64_t));
xpu_malloc(reinterpret_cast<void**>(&d_slot_vector),
slot_lengths_lod.size() * sizeof(int));
xpu_memcpy(gpu_values, grad_values.data(),
grad_values.size() * sizeof(float*), XPU_HOST_TO_DEVICE);
xpu_memcpy(gpu_len, slot_lengths_lod.data(),
slot_lengths.size() * sizeof(int64_t), XPU_HOST_TO_DEVICE);
xpu_memcpy(d_slot_vector, slot_vector_.data(),
slot_lengths_lod.size() * sizeof(int), XPU_HOST_TO_DEVICE);
long long* c_len = (long long*)gpu_len;
PushCopy<<<2, 64, stream>>>(total_grad_values_gpu, gpu_values, c_len,
hidden_size, slot_lengths.size(), total_length,
batch_size, d_slot_vector);
xpu_wait(stream);
}
void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff,
float min_bound, float max_bound,
float learning_rate, float initial_g2sum,
float initial_range) {
OptimizerConfig optimizer_config; OptimizerConfig optimizer_config;
optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound, optimizer_config.set_sparse_sgd(nonclk_coeff,
learning_rate, initial_g2sum, initial_range); clk_coeff,
min_bound,
max_bound,
learning_rate,
initial_g2sum,
initial_range,
beta1_decay_rate,
beta2_decay_rate,
ada_epsilon);
HeterPs_->set_sparse_sgd(optimizer_config); HeterPs_->set_sparse_sgd(optimizer_config);
} }
void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds, void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
float mf_learning_rate, float mf_initial_g2sum, float mf_learning_rate,
float mf_initial_range, float mf_min_bound, float mf_initial_g2sum,
float mf_max_bound) { float mf_initial_range,
float mf_min_bound,
float mf_max_bound,
float mf_beta1_decay_rate,
float mf_beta2_decay_rate,
float mf_ada_epsilon) {
OptimizerConfig optimizer_config; OptimizerConfig optimizer_config;
optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate, optimizer_config.set_embedx_sgd(mf_create_thresholds,
mf_initial_g2sum, mf_initial_range, mf_learning_rate,
mf_min_bound, mf_max_bound); mf_initial_g2sum,
mf_initial_range,
mf_min_bound,
mf_max_bound,
mf_beta1_decay_rate,
mf_beta2_decay_rate,
mf_ada_epsilon);
HeterPs_->set_embedx_sgd(optimizer_config); HeterPs_->set_embedx_sgd(optimizer_config);
} }
......
...@@ -594,6 +594,21 @@ class DistributedStrategy(object): ...@@ -594,6 +594,21 @@ class DistributedStrategy(object):
bounds = strategy.get(prefix + 'sparse_weight_bounds', bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10]) [-10, 10])
sgd.adam.weight_bounds.extend(bounds) sgd.adam.weight_bounds.extend(bounds)
elif optimizer_name == "shared_adam":
sgd.name = 'SparseSharedAdamSGDRule'
sgd.adam.learning_rate = strategy.get(
prefix + 'sparse_learning_rate', 0.001)
sgd.adam.initial_range = strategy.get(
prefix + 'sparse_initial_range', 1e-4)
sgd.adam.beta1_decay_rate = strategy.get(
prefix + 'sparse_beta1_decay_rate', 0.9)
sgd.adam.beta2_decay_rate = strategy.get(
prefix + 'sparse_beta2_decay_rate', 0.999)
sgd.adam.ada_epsilon = strategy.get(
prefix + 'sparse_ada_epsilon', 1e-8)
bounds = strategy.get(prefix + 'sparse_weight_bounds',
[-10, 10])
sgd.adam.weight_bounds.extend(bounds)
def set_sparse_table_config(table_data, config): def set_sparse_table_config(table_data, config):
for key in config: for key in config:
......
...@@ -195,7 +195,7 @@ class Accessor: ...@@ -195,7 +195,7 @@ class Accessor:
sgd_param.naive.initial_range = 0.0001 sgd_param.naive.initial_range = 0.0001
if len(sgd_param.naive.weight_bounds) == 0: if len(sgd_param.naive.weight_bounds) == 0:
sgd_param.naive.weight_bounds.extend([-10.0, 10.0]) sgd_param.naive.weight_bounds.extend([-10.0, 10.0])
if sgd_param.name == "SparseAdamSGDRule": if sgd_param.name == "SparseAdamSGDRule" or sgd_param.name == "SparseSharedAdamSGDRule":
if not sgd_param.adam.HasField("learning_rate"): if not sgd_param.adam.HasField("learning_rate"):
sgd_param.adam.learning_rate = 0.001 sgd_param.adam.learning_rate = 0.001
if not sgd_param.adam.HasField("initial_range"): if not sgd_param.adam.HasField("initial_range"):
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import os
os.environ["WITH_DISTRIBUTE"] = "ON"
import unittest
import tempfile
import shutil
import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.distributed.fleet as fleet
paddle.enable_static()
# For Net
base_lr = 0.2
emb_lr = base_lr * 3
dict_dim = 1500
emb_dim = 128
hid_dim = 128
margin = 0.1
sample_rate = 1
batch_size = 4
# this unittest is tested for SparseSharedAdamSGDRule
class TestPSPassWithBow(unittest.TestCase):
def net(self):
def get_acc(cos_q_nt, cos_q_pt, batch_size):
cond = fluid.layers.less_than(cos_q_nt, cos_q_pt)
cond = fluid.layers.cast(cond, dtype='float64')
cond_3 = fluid.layers.reduce_sum(cond)
acc = fluid.layers.elementwise_div(cond_3,
fluid.layers.fill_constant(
shape=[1],
value=batch_size * 1.0,
dtype='float64'),
name="simnet_acc")
return acc
def get_loss(cos_q_pt, cos_q_nt):
loss_op1 = fluid.layers.elementwise_sub(
fluid.layers.fill_constant_batch_size_like(input=cos_q_pt,
shape=[-1, 1],
value=margin,
dtype='float32'),
cos_q_pt)
loss_op2 = fluid.layers.elementwise_add(loss_op1, cos_q_nt)
loss_op3 = fluid.layers.elementwise_max(
fluid.layers.fill_constant_batch_size_like(input=loss_op2,
shape=[-1, 1],
value=0.0,
dtype='float32'),
loss_op2)
avg_cost = fluid.layers.mean(loss_op3)
return avg_cost
is_distributed = False
is_sparse = True
# query
q = fluid.layers.data(name="query_ids",
shape=[1],
dtype="int64",
lod_level=1)
# embedding
q_emb = fluid.contrib.layers.sparse_embedding(
input=q,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr))
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__q_fc__",
learning_rate=base_lr))
# label data
label = fluid.layers.data(name="label", shape=[1], dtype="int64")
# pt
pt = fluid.layers.data(name="pos_title_ids",
shape=[1],
dtype="int64",
lod_level=1)
# embedding
pt_emb = fluid.contrib.layers.sparse_embedding(
input=pt,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr))
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
# nt
nt = fluid.layers.data(name="neg_title_ids",
shape=[1],
dtype="int64",
lod_level=1)
# embedding
nt_emb = fluid.contrib.layers.sparse_embedding(
input=nt,
size=[dict_dim, emb_dim],
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__emb__",
learning_rate=emb_lr))
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
size=hid_dim,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0.01),
name="__fc__",
learning_rate=base_lr),
bias_attr=fluid.ParamAttr(name="__fc_b__"))
cos_q_pt = fluid.layers.cos_sim(q_fc, pt_fc)
cos_q_nt = fluid.layers.cos_sim(q_fc, nt_fc)
# loss
avg_cost = get_loss(cos_q_pt, cos_q_nt)
# acc
acc = get_acc(cos_q_nt, cos_q_pt, batch_size)
return [avg_cost, acc, cos_q_pt]
def test(self):
os.environ["PADDLE_PSERVER_NUMS"] = "2"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ["POD_IP"] = "127.0.0.1"
os.environ["PADDLE_PORT"] = "36001"
os.environ["PADDLE_TRAINER_ID"] = "0"
os.environ["PADDLE_TRAINERS_NUM"] = "2"
os.environ[
"PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001,127.0.0.2:36001"
os.environ["TRAINING_ROLE"] = "PSERVER"
role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
loss, acc, _ = self.net()
strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
configs = {}
configs['__emb__'] = {
"table_parameters.__emb__.accessor.embed_sgd_param.name":
"SparseSharedAdamSGDRule",
"table_parameters.__emb__.accessor.embedx_sgd_param.name":
"SparseSharedAdamSGDRule",
}
strategy.sparse_table_configs = configs
optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(loss)
fleet.init_server()
if __name__ == '__main__':
unittest.main()
...@@ -334,6 +334,14 @@ class TestStrategyConfig(unittest.TestCase): ...@@ -334,6 +334,14 @@ class TestStrategyConfig(unittest.TestCase):
strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad. strategy.sparse_table_configs[0].accessor.embed_sgd_param.adagrad.
initial_range, 0.0001) initial_range, 0.0001)
strategy = paddle.distributed.fleet.DistributedStrategy()
configs = {}
configs['emb'] = {"sparse_optimizer": "shared_adam"}
strategy.fleet_desc_configs = configs
self.assertEqual(
strategy.sparse_table_configs[0].accessor.embed_sgd_param.adam.
beta1_decay_rate, 0.9)
def test_trainer_desc_configs(self): def test_trainer_desc_configs(self):
strategy = paddle.distributed.fleet.DistributedStrategy() strategy = paddle.distributed.fleet.DistributedStrategy()
configs = { configs = {
......
...@@ -671,7 +671,8 @@ HIGH_PARALLEL_JOB_NEW = [ ...@@ -671,7 +671,8 @@ HIGH_PARALLEL_JOB_NEW = [
'test_trt_convert_reduce_sum', 'test_trt_convert_reduce_sum',
'save_quant2_model_lstm', 'save_quant2_model_lstm',
'test_trt_convert_slice', 'test_trt_convert_slice',
'test_quant2_int8_lstm_mkldnn' 'test_quant2_int8_lstm_mkldnn',
'test_dist_fleet_ps13'
] ]
# mem=0 but always timeout or failed : It run 15 job each time in Single cases; # mem=0 but always timeout or failed : It run 15 job each time in Single cases;
......