Commit 51a86d2b authored by Aurelius84, committed by hong

Optimize adam speed (#21777)

* optimize adam speed by removing _finish_update test=develop

* fix SparseAdamFunctor param list test=develop

* Remove scale_op in expect_list of adam_op test=develop

* fix test optimizer loss assert error test=develop

* fix test optimizer loss assert error test=develop

* modify PADDLE_ENFORCE usage test=develop

* fix op_type in lamb_op.cc test=develop

* fix errors ostream format bug test=develop

* add betaPowOut in ngraph op test=develop

* fix ngraph::op api for gcc8 test=develop

* clean code test=develop

* modify struct into class test=develop

* remove code of beta1Tensor in lamb_op test=develop
Parent commit: 310edc0d
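
The heart of this change: AdamOptimizer._finish_update used to append two scale ops per parameter to advance the Beta1Pow/Beta2Pow accumulators; the adam op now emits Beta1PowOut and Beta2PowOut itself, so those scale ops disappear from the program (and the fuse pass below no longer needs to fuse them). A minimal NumPy sketch of one step, assuming the conventional bias-corrected Adam form; the authoritative kernel formula lives in adam_op.h:

    import numpy as np

    def adam_step_sketch(param, grad, m1, m2, lr, beta1, beta2, eps, beta1_pow, beta2_pow):
        """Illustrative NumPy stand-in for one adam_op step after this change."""
        m1_out = beta1 * m1 + (1 - beta1) * grad               # first-moment update
        m2_out = beta2 * m2 + (1 - beta2) * grad * grad        # second-moment update
        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)   # assumed bias correction
        param_out = param - lr_t * m1_out / (np.sqrt(m2_out) + eps)
        # New in this commit: the op advances the power accumulators itself.
        beta1_pow_out = beta1_pow * beta1
        beta2_pow_out = beta2_pow * beta2
        return param_out, m1_out, m2_out, beta1_pow_out, beta2_pow_out
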
......@@ -39,13 +39,6 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
const std::vector<ir::Node *> &adam_ops, ir::Graph *graph) const {
auto fused_adam_node =
FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph);
auto fused_scale1 =
FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"),
adam_ops, graph);
auto fused_scale2 =
FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"),
adam_ops, graph);
RemoveCycleDepsBetweenOpNodes(graph, fused_scale1, fused_scale2);
return fused_adam_node;
}
......@@ -139,6 +132,8 @@ class FuseAdamOpPass : public FuseOptimizerOpPass {
adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
adam_desc.SetOutput("Beta1PowOut", {fused_vars_name.at("Beta1Pow")});
adam_desc.SetOutput("Beta2PowOut", {fused_vars_name.at("Beta2Pow")});
adam_desc.SetAttr("beta1", beta1);
adam_desc.SetAttr("beta2", beta2);
adam_desc.SetAttr("epsilon", epsilon);
......
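
Because the fused adam op now writes Beta1PowOut/Beta2PowOut directly, FuseAdamOpPass no longer emits two fused scale ops nor has to break the cycle they introduced (RemoveCycleDepsBetweenOpNodes). An illustrative before/after of the per-parameter op sequence, consistent with the unit-test expectations near the end of this diff:

    # Illustrative only; op type names as they appear in the tests below.
    beta_update_ops_before = ["adam", "scale", "scale"]  # scale ops advanced Beta1Pow / Beta2Pow
    beta_update_ops_after = ["adam"]                     # adam writes Beta1PowOut / Beta2PowOut in place
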
......@@ -416,6 +416,8 @@ void FuseOptimizerOpPass::FuseVarsToContinuousSpace(
result->Get<details::ProgramDescs>(details::kProgramDescs).back();
auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) {
VLOG(6) << "aux_var_names : " << var_name
<< ". fused_vars_name: " << fused_vars_name.at(var_name);
AppendCoalesceTensorOp(aux_var_map.at(var_name), aux_var_map.at(var_name),
fused_vars_name.at(var_name), dtype, global_block,
true);
......
......@@ -68,9 +68,14 @@ void BuildAdamNode(
auto delta = ElementwiseScalar<ngraph::op::Multiply>(updated_lr, param_grad);
auto param_out = std::make_shared<ngraph::op::Subtract>(param, delta);
auto beta1_pow_out = ElementwiseScalar<ngraph::op::Multiply>(beta1, beta1pow);
auto beta2_pow_out = ElementwiseScalar<ngraph::op::Multiply>(beta2, beta2pow);
platform::SetOutputNode(op, "Moment1Out", moment1out, ngb_node_map);
platform::SetOutputNode(op, "Moment2Out", moment2out, ngb_node_map);
platform::SetOutputNode(op, "ParamOut", param_out, ngb_node_map);
platform::SetOutputNode(op, "Beta1PowOut", beta1_pow_out, ngb_node_map);
platform::SetOutputNode(op, "Beta2PowOut", beta2_pow_out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
......
......@@ -66,37 +66,63 @@ void AdamOp::InferShape(framework::InferShapeContext* ctx) const {
"Output(Moment2Out) of AdamOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(framework::product(lr_dims), 0,
"Maybe the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 dimension");
PADDLE_ENFORCE_NE(
framework::product(lr_dims), 0,
platform::errors::InvalidArgument(
"The number of LearningRate shall not be 0, but received %d. Maybe "
"the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.",
framework::product(lr_dims)));
PADDLE_ENFORCE_EQ(
framework::product(lr_dims), 1,
platform::errors::InvalidArgument(
"Learning rate should have 1 dimension, but received %d",
framework::product(lr_dims)));
auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
"Beta1 power accumulator should have 1 dimension");
VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]";
PADDLE_ENFORCE_GE(framework::product(beta1_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta1 power accumulator should be greater "
"than 0, but received %d.",
framework::product(beta1_pow_dims)));
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
"Beta2 power accumulator should have 1 dimension");
VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]";
PADDLE_ENFORCE_GE(framework::product(beta2_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta2 power accumulator should be greater "
"than 0, but received %d.",
framework::product(beta2_pow_dims)));
auto param_dims = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of AdamOp should have same dimension");
platform::errors::InvalidArgument(
"Param and Grad input of AdamOp should have same dimension. But "
"received Param dims: [%s], Grad dims: [%s].",
param_dims, ctx->GetInputDim("Grad")));
}
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
"Param and Moment1 input of AdamOp should have same dimension");
platform::errors::InvalidArgument(
"Param and Moment1 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment1 dims: [%s].",
param_dims, ctx->GetInputDim("Moment1")));
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
"Param and Moment2 input of AdamOp should have same dimension");
platform::errors::InvalidArgument(
"Param and Moment2 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment2 dims: [%s].",
param_dims, ctx->GetInputDim("Moment2")));
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
ctx->SetOutputDim("Moment2Out", param_dims);
ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims);
}
framework::OpKernelType AdamOp::GetExpectedKernelType(
......@@ -130,6 +156,8 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment");
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
AddAttr<float>("beta1",
"(float, default 0.9) "
......
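
The rewritten PADDLE_ENFORCE messages point at the most common cause of an empty LearningRate: the startup program was never run. A minimal fluid skeleton of the expected call order (hypothetical; the real network and the optimizer.minimize(loss) call go where indicated). The Beta1Pow/Beta2Pow checks are also relaxed from exactly one element to at least one element:

    import paddle.fluid as fluid

    # ... build the network and call optimizer.minimize(loss) here ...
    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())  # initializes LearningRate and the Beta*Pow accumulators
    # exe.run(fluid.default_main_program(), feed=..., fetch_list=[loss])
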
......@@ -52,10 +52,48 @@ struct GPUAdam;
struct CPUAdam;
template <typename T, typename Flavour>
struct AdamFunctor;
class AdamFunctor;
template <typename T>
struct AdamFunctor<T, GPUAdam> {
class BetaPowFunctor {
private:
T beta1_;
T beta2_;
const T* beta1_pow_;
const T* beta2_pow_;
T* beta1_pow_out_;
T* beta2_pow_out_;
public:
BetaPowFunctor(T beta1, T beta2, const T* beta1_pow, const T* beta2_pow,
T* beta1_pow_out, T* beta2_pow_out)
: beta1_(beta1),
beta2_(beta2),
beta1_pow_(beta1_pow),
beta2_pow_(beta2_pow),
beta1_pow_out_(beta1_pow_out),
beta2_pow_out_(beta2_pow_out) {}
inline HOSTDEVICE void update_step(size_t i) const {
T beta1_pow_i = beta1_pow_[i];
T beta2_pow_i = beta2_pow_[i];
beta1_pow_out_[i] = beta1_pow_i * beta1_;
beta2_pow_out_[i] = beta2_pow_i * beta2_;
}
inline HOSTDEVICE void operator()(size_t i) const { update_step(i); }
inline HOSTDEVICE void apply_update(size_t limit) const {
for (size_t i = 0; i < limit; ++i) {
update_step(i);
}
}
};
template <typename T>
class AdamFunctor<T, GPUAdam> {
private:
T beta1_;
T beta2_;
T epsilon_;
......@@ -71,6 +109,7 @@ struct AdamFunctor<T, GPUAdam> {
const T* param_;
T* param_out_;
public:
AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
T* mom2_out, const T* lr, const T* grad, const T* param,
......@@ -114,7 +153,8 @@ struct AdamFunctor<T, GPUAdam> {
};
template <typename T>
struct AdamFunctor<T, CPUAdam> {
class AdamFunctor<T, CPUAdam> {
private:
T beta1_;
T beta2_;
T epsilon_;
......@@ -130,6 +170,7 @@ struct AdamFunctor<T, CPUAdam> {
const T* param_;
T* param_out_;
public:
AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
T* mom2_out, const T* lr, const T* grad, const T* param,
......@@ -179,10 +220,11 @@ struct AdamFunctor<T, CPUAdam> {
};
template <typename T, typename Flavour>
struct SparseAdamFunctor;
class SparseAdamFunctor;
template <typename T>
struct SparseAdamFunctor<T, GPUAdam> {
class SparseAdamFunctor<T, GPUAdam> {
private:
T beta1_;
T beta2_;
T epsilon_;
......@@ -203,6 +245,7 @@ struct SparseAdamFunctor<T, GPUAdam> {
int64_t row_count_;
bool lazy_mode_;
public:
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad,
......@@ -261,7 +304,8 @@ struct SparseAdamFunctor<T, GPUAdam> {
};
template <typename T>
struct SparseAdamFunctor<T, CPUAdam> {
class SparseAdamFunctor<T, CPUAdam> {
private:
T beta1_;
T beta2_;
T epsilon_;
......@@ -281,6 +325,7 @@ struct SparseAdamFunctor<T, CPUAdam> {
int64_t row_numel_;
int64_t row_count_;
public:
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad,
......@@ -397,6 +442,10 @@ class AdamOpKernel : public framework::OpKernel<T> {
Ref(ctx.Output<LoDTensor>("Moment1Out"), "Must set Moment1Out");
auto& mom2_out =
Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
auto& beta1_pow_out =
Ref(ctx.Output<LoDTensor>("Beta1PowOut"), "Must set Beta1PowOut");
auto& beta2_pow_out =
Ref(ctx.Output<LoDTensor>("Beta2PowOut"), "Must set Beta2PowOut");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
......@@ -408,6 +457,14 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
beta2 = static_cast<T>(GetAttrFromTensor(beta2_tensor));
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
<< "beta2_pow.numel() : " << beta2_pow.numel();
VLOG(3) << "param.numel(): " << param.numel();
BetaPowFunctor<T> beta_functor(
beta1, beta2, beta1_pow.template data<T>(),
beta2_pow.template data<T>(),
beta1_pow_out.template mutable_data<T>(ctx.GetPlace()),
beta2_pow_out.template mutable_data<T>(ctx.GetPlace()));
if (grad_var->IsType<framework::LoDTensor>()) {
auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
......@@ -423,6 +480,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()));
functor(param.numel());
beta_functor.apply_update(beta2_pow.numel());
} else if (platform::is_gpu_place(ctx.GetPlace())) {
AdamFunctor<T, GPUAdam> functor(
beta1, beta2, epsilon, beta1_pow.template data<T>(),
......@@ -433,11 +491,16 @@ class AdamOpKernel : public framework::OpKernel<T> {
lr.template data<T>(), grad.template data<T>(),
param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()));
// update param and moment
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
// update beta1 and beta2
platform::ForRange<DeviceContext> for_range_beta(
static_cast<const DeviceContext&>(ctx.device_context()),
beta2_pow.numel());
for_range_beta(beta_functor);
}
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto& grad =
......@@ -485,6 +548,8 @@ class AdamOpKernel : public framework::OpKernel<T> {
lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size(), lazy_mode);
// update beta1 and beta2
beta_functor.apply_update(beta2_pow.numel());
if (lazy_mode) {
VLOG(3) << "run cpu lazy mode";
size_t row_count = grad_merge.rows().size();
......@@ -574,6 +639,11 @@ class AdamOpKernel : public framework::OpKernel<T> {
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
// update beta1 and beta2
platform::ForRange<DeviceContext> for_range_beta(
static_cast<const DeviceContext&>(ctx.device_context()),
beta2_pow.numel());
for_range_beta(beta_functor);
}
} else {
PADDLE_THROW("Variable type not supported by adam_op");
......
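
BetaPowFunctor loops over every element of the accumulators (the kernel uses beta2_pow.numel() as the bound), so it also covers the case where the fuse pass has coalesced the per-parameter Beta1Pow/Beta2Pow tensors into one larger tensor, which is presumably why the InferShape checks above only require at least one element. The CPU branches call apply_update directly, while the GPU branches launch the functor through platform::ForRange, as the kernel code above shows. A NumPy stand-in for apply_update, under those assumed semantics:

    import numpy as np

    def apply_update_sketch(beta1, beta2, beta1_pow, beta2_pow):
        """Element-wise mirror of BetaPowFunctor::apply_update (illustrative)."""
        beta1_pow_out = np.empty_like(beta1_pow)
        beta2_pow_out = np.empty_like(beta2_pow)
        for i in range(beta2_pow.size):  # mirrors apply_update(beta2_pow.numel())
            beta1_pow_out[i] = beta1_pow[i] * beta1
            beta2_pow_out[i] = beta2_pow[i] * beta2
        return beta1_pow_out, beta2_pow_out
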
......@@ -13,11 +13,111 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/lamb_op.h"
#include "paddle/fluid/operators/optimizers/adam_op.h"
namespace paddle {
namespace operators {
class LambOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
platform::errors::NotFound(
"Input(Param) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
platform::errors::NotFound(
"Input(Grad) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
platform::errors::NotFound(
"Input(Moment1) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
platform::errors::NotFound(
"Input(Moment2) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
platform::errors::NotFound(
"Input(Beta1Pow) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
platform::errors::NotFound(
"Input(Beta2Pow) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
platform::errors::NotFound(
"Output(Moment1Out) of LambOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
platform::errors::NotFound(
"Output(Moment2Out) of LambOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(
framework::product(lr_dims), 0,
platform::errors::InvalidArgument(
"The number of LearningRate shall not be 0, but received %d. Maybe "
"the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.",
framework::product(lr_dims)));
PADDLE_ENFORCE_EQ(
framework::product(lr_dims), 1,
platform::errors::InvalidArgument(
"Learning rate should have 1 dimension, but received %d.",
framework::product(lr_dims)));
auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
PADDLE_ENFORCE_GE(framework::product(beta1_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta1 power accumulator should be "
"greater than 0, but received %d.",
framework::product(beta1_pow_dims)));
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
PADDLE_ENFORCE_GE(framework::product(beta2_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta2 power accumulator should be "
"greater than 0, but received %d.",
framework::product(beta2_pow_dims)));
auto param_dims = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
platform::errors::InvalidArgument(
"Param and Grad input of LambOp should have same dimension. But "
"received Param dims: [%s], Grad dims: [%s].",
param_dims, ctx->GetInputDim("Grad")));
}
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
platform::errors::InvalidArgument(
"Param and Moment1 input of LambOp should have same dimension. But "
"received Param dims: [%s], Moment1 dims: [%s].",
param_dims, ctx->GetInputDim("Moment1")));
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
platform::errors::InvalidArgument(
"Param and Moment2 input of LambOp should have same dimension. But "
"received Param dims: [%s], Moment2 dims: [%s].",
param_dims, ctx->GetInputDim("Moment2")));
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
ctx->SetOutputDim("Moment2Out", param_dims);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
auto input_data_type =
OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
};
class LambOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
......@@ -79,7 +179,7 @@ learning rate, $\lambda$ the weight decay rate.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(lamb, ops::AdamOp, ops::LambOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(lamb, ops::LambOp, ops::LambOpMaker);
REGISTER_OP_CPU_KERNEL(
lamb, ops::LambOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::LambOpKernel<paddle::platform::CPUDeviceContext, double>);
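
lamb used to be registered with ops::AdamOp for shape inference; now that AdamOp::InferShape sets dims for Beta1PowOut/Beta2PowOut, which the lamb op does not declare, lamb gets its own LambOp class without the beta power outputs and is re-registered accordingly. An illustrative summary of the difference:

    # Illustrative only: outputs whose shapes InferShape sets after this change.
    infer_shape_outputs = {
        "adam": ["ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"],
        "lamb": ["ParamOut", "Moment1Out", "Moment2Out"],
    }
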
......@@ -1683,7 +1683,9 @@ class AdamOptimizer(Optimizer):
outputs = {
"ParamOut": param_and_grad[0],
"Moment1Out": moment1,
"Moment2Out": moment2
"Moment2Out": moment2,
"Beta1PowOut": beta1_pow_acc,
"Beta2PowOut": beta2_pow_acc,
}
attrs = {
"epsilon": self._epsilon,
......@@ -1709,46 +1711,6 @@ class AdamOptimizer(Optimizer):
return adam_op
def _finish_update(self, block, param_and_grads):
"""Update Beta1 and Beta2 Power accumulators
"""
assert isinstance(block, framework.Block)
main_block = block.program.global_block()
for param, grad in param_and_grads:
if grad is None or param.trainable is False:
continue
with param.block.program._optimized_guard(
[param, grad]), name_scope("optimizer"):
beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str,
param)
beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str,
param)
inputs = {"X": beta1_pow_acc}
attrs = {}
if isinstance(self._beta1, Variable):
inputs['ScaleTensor'] = self._beta1
else:
attrs['scale'] = self._beta1
main_block.append_op(
type="scale",
inputs=inputs,
outputs={"Out": beta1_pow_acc},
attrs=attrs,
stop_gradient=True)
inputs = {"X": beta2_pow_acc}
attrs = {}
if isinstance(self._beta2, Variable):
inputs['ScaleTensor'] = self._beta2
else:
attrs['scale'] = self._beta2
main_block.append_op(
type="scale",
inputs=inputs,
outputs={"Out": beta2_pow_acc},
attrs=attrs,
stop_gradient=True)
class AdamaxOptimizer(Optimizer):
"""
......
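
With _finish_update removed, the Python layer simply declares the accumulators as in-place outputs of the adam op; nothing is appended after it. A small runnable NumPy check (illustrative values) that the new in-op update produces the same accumulator trajectory as the old scale-op path:

    import numpy as np

    beta1 = 0.9
    acc_via_adam_op = np.array([beta1], dtype="float32")   # Beta1PowOut written by adam_op
    acc_via_scale_op = np.array([beta1], dtype="float32")  # old scale op from _finish_update

    for _ in range(5):
        acc_via_adam_op = acc_via_adam_op * beta1    # Beta1PowOut = Beta1Pow * beta1
        acc_via_scale_op = acc_via_scale_op * beta1  # scale(X=beta1_pow_acc, scale=beta1)

    assert np.allclose(acc_via_adam_op, acc_via_scale_op)
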
......@@ -58,7 +58,9 @@ class TestAdamOp1(OpTest):
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
}
def test_check_output(self):
......@@ -101,7 +103,9 @@ class TestAdamOp2(OpTest):
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
}
def test_check_output(self):
......@@ -122,11 +126,11 @@ class TestAdamOpMultipleSteps(OpTest):
moment2 = np.random.random((102, 105)).astype("float32")
learning_rate = 0.001
beta1 = 0.9
beta2 = 0.999
self.beta1 = 0.9
self.beta2 = 0.999
epsilon = 1e-8
beta1_pow = beta1**10
beta2_pow = beta2**10
self.beta1_pow = self.beta1**10
self.beta2_pow = self.beta2**10
self.inputs = {
'Param': param,
......@@ -134,21 +138,29 @@ class TestAdamOpMultipleSteps(OpTest):
'Moment1': moment1,
'Moment2': moment2,
'LearningRate': np.array([learning_rate]).astype("float32"),
'Beta1Pow': np.array([beta1_pow]).astype("float32"),
'Beta2Pow': np.array([beta2_pow]).astype("float32")
'Beta1Pow': np.array([self.beta1_pow]).astype("float32"),
'Beta2Pow': np.array([self.beta2_pow]).astype("float32")
}
self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
self.attrs = {
'epsilon': epsilon,
'beta1': self.beta1,
'beta2': self.beta2
}
def test_check_output(self):
for _ in range(self.num_steps):
param_out, moment1_out, \
moment2_out = adam_step(self.inputs, self.attrs)
beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1
beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out
'ParamOut': param_out,
'Beta1PowOut': beta1_pow_out,
'Beta2PowOut': beta2_pow_out
}
# Verify output for this step
......@@ -160,8 +172,8 @@ class TestAdamOpMultipleSteps(OpTest):
self.inputs['Moment2'] = moment2_out
# Update powers of Beta1 and Beta2 for next time step
self.inputs['Beta1Pow'] *= self.attrs['beta1']
self.inputs['Beta2Pow'] *= self.attrs['beta1']
self.inputs['Beta1Pow'] = beta1_pow_out
self.inputs['Beta2Pow'] = beta2_pow_out
# Randomize gradient for next step
self.inputs['Grad'] = np.random.uniform(
......@@ -254,6 +266,8 @@ class TestSparseAdamOp(unittest.TestCase):
beta1 = 0.78
beta2 = 0.836
epsilon = 1e-4
beta1_pow = np.array([beta1**10]).astype("float32")
beta2_pow = np.array([beta2**10]).astype("float32")
height = 10
rows = [0, 4, 7]
......@@ -264,8 +278,8 @@ class TestSparseAdamOp(unittest.TestCase):
"Param": np.full((height, row_numel), 5.0).astype("float32"),
"Moment1": np.full((height, row_numel), 5.0).astype("float32"),
"Moment2": np.full((height, row_numel), 5.0).astype("float32"),
'Beta1Pow': np.array([beta1**10]).astype("float32"),
'Beta2Pow': np.array([beta2**10]).astype("float32"),
'Beta1Pow': beta1_pow,
'Beta2Pow': beta2_pow,
"LearningRate": np.full((1), 2.0).astype("float32")
}
self.init_output = np.full((height, row_numel), 0.0).astype("float32")
......@@ -294,7 +308,9 @@ class TestSparseAdamOp(unittest.TestCase):
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2
"Moment2Out": mom2,
'Beta1PowOut': beta1_pow * beta1,
'Beta2PowOut': beta2_pow * beta2
}
def check_with_place(self, place, lazy_mode):
......@@ -376,7 +392,9 @@ class TestAdamOpBetaVariable(OpTest):
self.outputs = {
'Moment1Out': moment1_out,
'Moment2Out': moment2_out,
'ParamOut': param_out
'ParamOut': param_out,
'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1,
'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2
}
def test_check_output(self):
......
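
TestAdamOpMultipleSteps now feeds Beta1PowOut/Beta2PowOut back as the next step's Beta1Pow/Beta2Pow instead of multiplying the inputs in place (the old code even multiplied Beta2Pow by beta1). A runnable sketch of that bookkeeping, starting from the test's beta**10 accumulators; num_steps is illustrative:

    import numpy as np

    beta1, beta2, num_steps = 0.9, 0.999, 10
    beta1_pow = np.array([beta1 ** 10], dtype="float32")
    beta2_pow = np.array([beta2 ** 10], dtype="float32")

    for _ in range(num_steps):
        beta1_pow_out = beta1_pow * beta1                     # expected Beta1PowOut
        beta2_pow_out = beta2_pow * beta2                     # expected Beta2PowOut
        beta1_pow, beta2_pow = beta1_pow_out, beta2_pow_out   # next step's inputs

    assert np.isclose(beta1_pow[0], beta1 ** (10 + num_steps))
    assert np.isclose(beta2_pow[0], beta2 ** (10 + num_steps))
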
......@@ -320,9 +320,8 @@ class TestAdamOptimizer(unittest.TestCase):
self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
with framework.program_guard(program, init_program):
opts = adam_optimizer.apply_gradients(params_grads)
self.assertEqual(len(opts), 4)
self.assertEqual([op.type for op in opts],
["scale", "adam", "scale", "scale"])
self.assertEqual(len(opts), 2)
self.assertEqual([op.type for op in opts], ["scale", "adam"])
# Check accumulators
accumulators = adam_optimizer.get_accumulators()
......
......@@ -68,7 +68,7 @@ class TestTrainable(unittest.TestCase):
test_trainable,
feed_dict,
op_count={'adam': 1,
'scale': 2,
'scale': 0,
'mul_grad': 0})
self.check_trainable(
test_trainable,
......