From 77a8a3944a01ac2cb3a62c99cd7de459872a01b8 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+lili0826@users.noreply.github.com> Date: Mon, 23 Aug 2021 12:04:01 +0800 Subject: [PATCH] add adamw cuda kernel (#35020) * adamw support cuda * adamw support cuda --- paddle/fluid/operators/optimizers/adamw_op.cu | 438 ++++++++++++++++++ paddle/fluid/operators/optimizers/adamw_op.h | 104 ++++- paddle/fluid/pybind/op_function_generator.cc | 2 + .../fluid/tests/unittests/test_adamw_op.py | 27 -- python/paddle/optimizer/adamw.py | 82 +--- 5 files changed, 551 insertions(+), 102 deletions(-) create mode 100644 paddle/fluid/operators/optimizers/adamw_op.cu diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu new file mode 100644 index 00000000000..af2bb93e06d --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -0,0 +1,438 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/adamw_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +__global__ void AdamWKernelREG(MT beta1, MT beta2, MT epsilon, MT coeff, + MT beta1_pow_, MT beta2_pow_, const MT* moment1, + MT* moment1_out, const MT* moment2, + MT* moment2_out, const MT* lr_, const T* grad, + const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + MT wd = static_cast(1.0) - coeff * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = moment1[id]; + MT mom2 = moment2[id]; + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + p = wd * p - + lr * (mom1 / + (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamWKernelMEM(MT beta1, MT beta2, MT epsilon, MT coeff, + const MT* beta1_pow_, const MT* beta2_pow_, + const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, + const MT* lr_, const T* grad, const T* param, + T* param_out, const MT* master_param, + MT* master_param_out, int ndim) { + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + MT wd = static_cast(1.0) - coeff * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? 
master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + p = wd * p - + lr * (mom1 / + (sqrt(mom2) + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} +template +__global__ void UpdateAdamWBetaPow(T beta1, T beta2, const T* beta1_pow_, + const T* beta2_pow_, T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamWCUDAKernelREG( + MT beta1, MT beta2, MT epsilon, MT coeff, const MT beta1_pow, + const MT beta2_pow, const MT* mom1_, MT* mom1_out_, const MT* mom2_, + MT* mom2_out_, const MT* lr_, const T* grad_, const T* param_, + T* param_out_, const MT* master_param, MT* master_param_out, + const int64_t* rows_, int64_t row_numel, int64_t row_count, bool lazy_mode, + int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_; + + MT wd = static_cast(1.0) - coeff * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + math::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = mom1_[id]; + MT mom2 = mom2_[id]; + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? static_cast(grad_[row_idx * row_numel + id % row_numel]) + : static_cast(0); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + p = wd * p - + lr * (mom1 / (sqrt(mom2) + + epsilon * sqrt(static_cast(1.0) - beta2_pow))); + + // Write back to global memory + mom1_out_[id] = mom1; + mom2_out_[id] = mom2; + param_out_[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } + } +} + +template +class AdamWOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + using paddle::framework::LoDTensor; + using MPDType = typename details::MPTypeTrait::Type; + + int64_t min_row_size_to_use_multithread = + ctx.Attr("min_row_size_to_use_multithread"); + bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + float coeff = ctx.Attr("coeff"); + + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + 
auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + TensorToVector(*skip_update_tensor, ctx.device_context(), + &skip_update_vec); + skip_update = skip_update_vec[0]; + } + + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adamw skip update"; + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); + framework::TensorCopy( + *beta1_pow, ctx.GetPlace(), + ctx.template device_context(), + beta1_pow_out); + framework::TensorCopy( + *beta2_pow, ctx.GetPlace(), + ctx.template device_context(), + beta2_pow_out); + return; + } + + // if with_decay = false, coeff = 0 + bool with_decay = ctx.Attr("with_decay"); + if (!with_decay) { + coeff = static_cast(0.0); + } + + MPDType beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + MPDType beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const bool multi_precision = ctx.Attr("multi_precision"); + const LoDTensor* master_param = nullptr; + LoDTensor* master_param_out = nullptr; + if (multi_precision) { + bool has_master = + ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ(has_master, true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); + } + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision + ? 
master_param_out->mutable_data(ctx.GetPlace()) + : nullptr; + + auto& dev_ctx = ctx.template device_context(); + + if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); + + // update param and moment + int threads = 512; + int blocks = (param->numel() + threads - 1) / threads; + + if (beta1_pow->place() == platform::CPUPlace() && + beta2_pow->place() == platform::CPUPlace()) { + // Compute with betapow in REG + AdamWKernelREG<<>>( + beta1, beta2, epsilon, coeff, *beta1_pow->data(), + *beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad->data(), param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param->numel()); + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } + } else { + AdamWKernelMEM<<>>( + beta1, beta2, epsilon, coeff, beta1_pow->data(), + beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad->data(), param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param->numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } + } + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); + if (grad->rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad->rows().begin(), grad->rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + framework::SelectedRows tmp_grad_merge; + const framework::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = grad; + } else { + // merge duplicated rows if any. 
+ // The rows of grad_merge have been sorted inside MergeAdd functor + scatter::MergeAdd merge_func; + merge_func(ctx.template device_context(), + *grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + if (beta1_pow->place() == platform::CPUPlace() && + beta2_pow->place() == platform::CPUPlace()) { + int threads = 512; + int ndim = param->numel(); + int blocks = (ndim + threads - 1) / threads; + + SparseAdamWCUDAKernelREG< + T, MPDType><<>>( + beta1, beta2, epsilon, coeff, *beta1_pow->data(), + *beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad_data, param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, rows, row_numel, grad_merge.rows().size(), + lazy_mode, ndim); + if (!use_global_beta_pow) { + // Update with cpu + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } + } else { + SparseAdamWFunctor functor( + beta1, beta2, epsilon, coeff, beta1_pow->data(), + beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad_data, param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, rows, row_numel, grad_merge.rows().size(), + lazy_mode); + + // FIXME(minqiyang): remove BinarySearch in GPU later + platform::ForRange for_range( + static_cast( + ctx.device_context()), + param->numel()); + for_range(functor); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateAdamWBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type not supported by adamw_op")); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(adamw, ops::AdamWOpCUDAKernel, + ops::AdamWOpCUDAKernel, + ops::AdamWOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/adamw_op.h b/paddle/fluid/operators/optimizers/adamw_op.h index 3301bc4808e..d87f827bbf2 100644 --- a/paddle/fluid/operators/optimizers/adamw_op.h +++ b/paddle/fluid/operators/optimizers/adamw_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -22,6 +22,7 @@ class AdamWOp : public AdamOp { using AdamOp::AdamOp; }; +struct GPUAdamW; struct CPUAdamW; template @@ -46,6 +47,107 @@ class AdamWFunctor { } }; +template +class SparseAdamWFunctor; + +template +class SparseAdamWFunctor { + private: + MT beta1_; + MT beta2_; + MT epsilon_; + MT coeff_; + + const MT* beta1_pow_; + const MT* beta2_pow_; + const MT* moment1_; + MT* moment1_out_; + const MT* moment2_; + MT* moment2_out_; + const MT* lr_; + const T* grad_; + const T* param_; + T* param_out_; + const MT* master_param_; + MT* master_param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + bool lazy_mode_; + + public: + SparseAdamWFunctor(MT beta1, MT beta2, MT epsilon, MT coeff, + const MT* beta1_pow, const MT* beta2_pow, const MT* mom1, + MT* mom1_out, const MT* mom2, MT* mom2_out, const MT* lr, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + const int64_t* rows, int64_t row_numel, int64_t row_count, + bool lazy_mode) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + coeff_(coeff), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + master_param_(master_param), + master_param_out_(master_param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count), + lazy_mode_(lazy_mode) {} + + inline HOSTDEVICE void adamw_update(size_t i, MT g) const { + // The following code is the same as dense + MT mom1 = moment1_[i]; + MT mom2 = moment2_[i]; + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + MT p = master_param_ ? master_param_[i] : static_cast(param_[i]); + + // Calculation + MT wd = static_cast(1.0) - coeff_ * lr; + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; + p = wd * p - + lr * (mom1 / + (sqrt(mom2) + epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = static_cast(p); + if (master_param_out_) { + master_param_out_[i] = p; + } + } + + inline HOSTDEVICE void operator()(size_t i) const { + auto row_idx = + math::BinarySearch(rows_, row_count_, i / row_numel_); + if (lazy_mode_ && row_idx < 0) { + return; + } else { + MT g = row_idx >= 0 + ? 
static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) + : static_cast(0); + adamw_update(i, g); + } + } +}; + template class AdamWOpKernel : public AdamOpKernel { public: diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index dc27befd26c..4b610f3bccb 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -118,6 +118,8 @@ std::map> op_passing_outs_map = { {"sgd", {"ParamOut"}}, {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"average_accumulates", {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}}, diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index fdc3682f4b1..d99e15b2128 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -93,33 +93,6 @@ class TestAdamWOp(unittest.TestCase): adam = paddle.optimizer.AdamW( 0.1, epsilon=-1, parameters=linear.parameters()) - def test_adamw_lr_decay(self): - paddle.disable_static() - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) - linear = paddle.nn.Linear(13, 5) - - lr = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=10) - wd = 0.1 - adam = paddle.optimizer.AdamW( - learning_rate=lr, - parameters=linear.parameters(), - apply_decay_param_fun=lambda name: True, - weight_decay=wd) - - for _ in range(2): - out = linear(a) - out.backward() - lr_to_coeff = adam._lr_to_coeff - adam.step() - - for i, value in enumerate(lr_to_coeff.values()): - self.assertAlmostEqual(value.numpy()[0], 1.0 - lr() * wd) - self.assertEqual(len(adam._lr_to_coeff), 0) - - lr.step() - adam.clear_gradients() - class TestAdamWOpGroup(TestAdamWOp): def test_adamw_op_dygraph(self): diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 96578590897..e6ec91dc415 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -48,8 +48,8 @@ class AdamW(Adam): Args: learning_rate (float|LRScheduler, optional): The learning rate used to update ``Parameter``. It can be a float value or a LRScheduler. The default value is 0.001. - parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \ - This parameter is required in dygraph mode. And you can specify different options for \ + parameters (list|tuple, optional): List/Tuple of ``Tensor`` names to update to minimize ``loss``. \ + This parameter is required in dygraph mode. And you can specify different options for \ different parameter groups such as the learning rate, weight decay, etc, \ then the parameters are list of dict. Note that the learning_rate in paramter groups \ represents the scale of base learning_rate. 
\ @@ -162,7 +162,6 @@ class AdamW(Adam): self._params_name = set() self._apply_decay_param_fun = apply_decay_param_fun self._coeff = coeff - self._lr_to_coeff = dict() super(AdamW, self).__init__( learning_rate=learning_rate, @@ -178,9 +177,6 @@ class AdamW(Adam): self.type = "adamw" - # now the adamw op doesn't support cuda - if core.is_compiled_with_cuda(): - self.type = "adam" # Use _auxiliary_vars together with _set_auxiliary_var/_get_auxiliary_var to achieve that. self._auxiliary_vars = dict() @@ -193,64 +189,7 @@ class AdamW(Adam): else: return None - def _append_decoupled_weight_decay(self, block, param_and_grad): - """ - Add decoupled weight decay op. - parameter = parameter - parameter * coeff * lr - - Args: - block: block in which variable is to be created - param_and_grad: (parameters, gradients) pairs, - the parameters need to decay. - Raises: - Exception: The type of coeff and parameter is not consistent. - """ - if isinstance(param_and_grad, dict): - param_and_grad = self._update_param_group(param_and_grad) - param, grad = param_and_grad - - if self._apply_decay_param_fun is not None \ - and not self._apply_decay_param_fun(param.name): - return - - if isinstance(self._learning_rate, float): - learning_rate = self._learning_rate - else: - # NOTE. We add this function to the _append_optimize_op(), - # for we must make sure _create_param_lr() be called after - # optimizer._create_global_learning_rate(). - learning_rate = self._create_param_lr(param_and_grad) - - with block.program._optimized_guard( - [param, grad]), framework.name_scope('weight decay'): - self._params_name.add(param.name) - - # If it has been calculated, the result will be reused. - # NOTE(wangxi): In dygraph mode, apply_gradient will be executed - # every step, so need clear _lr_to_coeff every step, - # we do this in _create_optimization_pass - decay_coeff = self._lr_to_coeff.get(learning_rate, None) - if decay_coeff is None: - # NOTE(wangxi): for pipeline to set device:all - with paddle.static.device_guard(None): - decay_coeff = 1.0 - learning_rate * self._coeff - self._lr_to_coeff[learning_rate] = decay_coeff - - find_master = (self._multi_precision and - param.dtype == core.VarDesc.VarType.FP16) - if find_master: - master_weight = self._master_weights[param.name] - scaled_param = master_weight * decay_coeff - paddle.fluid.layers.assign( - input=scaled_param, output=master_weight) - else: - scaled_param = param * decay_coeff - paddle.fluid.layers.assign(input=scaled_param, output=param) - def _append_optimize_op(self, block, param_and_grad): - if not core.is_compiled_with_npu(): - self._append_decoupled_weight_decay(block, param_and_grad) - return super(AdamW, self)._append_optimize_op(block, param_and_grad) assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): @@ -262,6 +201,8 @@ class AdamW(Adam): if self._apply_decay_param_fun is not None \ and not self._apply_decay_param_fun(param.name): with_decay = False + else: + self._params_name.add(param.name) moment1 = self._get_accumulator(self._moment1_acc_str, param_and_grad[0]) @@ -277,19 +218,19 @@ class AdamW(Adam): if find_master else None) lr = self._create_param_lr(param_and_grad) - # create the adam optimize op + # create the adamw optimize op if framework.in_dygraph_mode(): _beta1 = self._beta1 if not isinstance( self._beta1, Variable) else self._beta1.numpy().item(0) _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _ = _C_ops.adamw( 
param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, param_and_grad[0], moment1, moment2, beta1_pow_acc, beta2_pow_acc, 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', - 1000, 'beta1', _beta1, 'beta2', _beta2) + 1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff) return None @@ -350,13 +291,6 @@ class AdamW(Adam): return adamw_op - def _create_optimization_pass(self, parameters_and_grads): - optimize_ops = super( - AdamW, self)._create_optimization_pass(parameters_and_grads) - # In dygraph mode, clear _lr_to_coeff after applied gradient - self._lr_to_coeff = dict() - return optimize_ops - def __str__(self): return " ".join(["Weight Decay, params:", ",".join(self._params_name)]) -- GitLab
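
Note: for reference, the dense update that the new AdamWKernelREG / AdamWKernelMEM kernels perform can be sketched in plain NumPy as below. This is a minimal reference sketch, not part of the patch; the function name `adamw_step` is illustrative only.

```python
import numpy as np

def adamw_step(p, g, mom1, mom2, lr, beta1, beta2, epsilon, coeff,
               beta1_pow, beta2_pow):
    # Decoupled weight decay: scale the parameter by (1 - coeff * lr),
    # matching `wd` in the CUDA kernels.
    wd = 1.0 - coeff * lr
    # Bias-corrected step size, matching the in-kernel rescaling of `lr`.
    lr_t = lr * np.sqrt(1.0 - beta2_pow) / (1.0 - beta1_pow)
    # First and second moment estimates.
    mom1 = beta1 * mom1 + (1.0 - beta1) * g
    mom2 = beta2 * mom2 + (1.0 - beta2) * g * g
    # Parameter update; epsilon is scaled by sqrt(1 - beta2_pow),
    # exactly as in the kernels above.
    p = wd * p - lr_t * (mom1 / (np.sqrt(mom2) +
                                 epsilon * np.sqrt(1.0 - beta2_pow)))
    return p, mom1, mom2
```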
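
Note: with the fused `adamw` op available, the Python `AdamW` optimizer no longer builds the separate decoupled-weight-decay ops (`_append_decoupled_weight_decay`, `_lr_to_coeff`) and instead passes `coeff` to `_C_ops.adamw`. A minimal dygraph usage sketch, mirroring the existing unit test in test_adamw_op.py (shapes and hyperparameters are illustrative):

```python
import numpy as np
import paddle

paddle.disable_static()
value = np.arange(26).reshape(2, 13).astype("float32")
a = paddle.to_tensor(value)
linear = paddle.nn.Linear(13, 5)

# weight_decay is forwarded as the `coeff` attribute of the adamw op.
opt = paddle.optimizer.AdamW(
    learning_rate=0.01,
    parameters=linear.parameters(),
    weight_decay=0.01)

for _ in range(2):
    out = linear(a)
    out.backward()
    opt.step()             # runs the single fused adamw op
    opt.clear_gradients()
```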