diff --git a/paddle/fluid/operators/adam_op.h b/paddle/fluid/operators/adam_op.h index b332b6716369ca355454dc57fbd6cc2cbc71c658..f82ff47b52490c354f383515d430d14e24cbf6af 100644 --- a/paddle/fluid/operators/adam_op.h +++ b/paddle/fluid/operators/adam_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include // for sqrt in CPU and CUDA +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -24,8 +25,14 @@ namespace operators { namespace scatter = paddle::operators::math::scatter; +struct GPUAdam; +struct CPUAdam; + +template +struct AdamFunctor; + template -struct AdamFunctor { +struct AdamFunctor { T beta1_; T beta2_; T epsilon_; @@ -71,6 +78,7 @@ struct AdamFunctor { // Calculation lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); @@ -82,6 +90,71 @@ struct AdamFunctor { } }; +template +struct AdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + void operator()(size_t numel) const { + Eigen::Map> g{ + grad_, static_cast(numel)}; + Eigen::Map> mom1{ + moment1_, static_cast(numel)}; + Eigen::Map> mom2{ + moment2_, static_cast(numel)}; + Eigen::Map> param{ + param_, static_cast(numel)}; + + Eigen::Map> param_out{ + param_out_, static_cast(numel)}; + Eigen::Map> moment1_out{ + moment1_out_, static_cast(numel)}; + Eigen::Map> moment2_out{ + moment2_out_, static_cast(numel)}; + + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + + moment1_out = beta1_ * mom1 + (1 - beta1_) * g; + moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g; + param_out = param - lr * (moment1_out / (moment2_out.sqrt() + epsilon_)); + } +}; + template struct SparseAdamFunctor { T beta1_; @@ -134,6 +207,7 @@ struct SparseAdamFunctor { T p = param_[rows_[i] * row_numel_ + j]; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); @@ -177,19 +251,34 @@ class AdamOpKernel : public framework::OpKernel { if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); - AdamFunctor functor( - beta1, beta2, epsilon, beta1_pow.template data(), - beta2_pow.template data(), mom1.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2.template data(), - mom2_out.template mutable_data(ctx.GetPlace()), - lr.template data(), grad.template data(), - param.template data(), - param_out.template mutable_data(ctx.GetPlace())); - platform::ForRange for_range( - static_cast(ctx.device_context()), - param.numel()); - for_range(functor); + + if (platform::is_cpu_place(ctx.GetPlace())) { + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + functor(param.numel()); + } else if (platform::is_gpu_place(ctx.GetPlace())) { + AdamFunctor functor( + beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + + platform::ForRange for_range( + static_cast(ctx.device_context()), + param.numel()); + for_range(functor); + } } else if (grad_var->IsType()) { auto& grad = Ref(ctx.Input("Grad"), "Must set Grad");