diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index e6577f662ae7b22fb7078ab5aa697c8a3da5feb2..a3434dfd6cc4c592734cd975eefb2a39b97c15ab 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -58,6 +58,8 @@ class DenseTensor; DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); +PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, + "number of threads for inner op"); DECLARE_bool(run_kp_kernel); DECLARE_bool(enable_host_event_recorder_hook); diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 8225dc8e07d6a281bc5059f4720debb89e5c40fe..bcb508cd37d6804ccf5648519eecb1c07e3be6ac 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -12,41 +12,125 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/framework/op_version_registry.h" - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/multiary.h" +#include "paddle/fluid/operators/optimizers/adamw_op.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -class AdamOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; +void AdamOp::InferShape(framework::InferShapeContext *ctx) const { + PADDLE_ENFORCE_EQ( + ctx->HasInput("Param"), true, + platform::errors::NotFound("Input(Param) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Grad"), true, + platform::errors::NotFound("Input(Grad) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true, + platform::errors::NotFound( + "Input(Moment1) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true, + platform::errors::NotFound( + "Input(Moment2) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true, + platform::errors::NotFound( + "Input(LearningRate) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true, + platform::errors::NotFound( + "Input(Beta1Pow) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true, + platform::errors::NotFound( + "Input(Beta2Pow) of AdamOp should not be null.")); + + PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true, + platform::errors::NotFound( + "Output(ParamOut) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true, + platform::errors::NotFound( + "Output(Moment1Out) of AdamOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true, + platform::errors::NotFound( + "Output(Moment2Out) of AdamOp should not be null.")); - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const { - auto input_data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "Param"); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); + auto lr_dims = ctx->GetInputDim("LearningRate"); + PADDLE_ENFORCE_NE( + phi::product(lr_dims), 0, + platform::errors::InvalidArgument( + "The number of LearningRate shall not be 0, but received %d. 
Maybe " + "the Input variable LearningRate has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.", + phi::product(lr_dims))); + PADDLE_ENFORCE_EQ( + phi::product(lr_dims), 1, + platform::errors::InvalidArgument( + "Learning rate should have 1 dimension, but received %d", + phi::product(lr_dims))); + auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow"); + VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]"; + PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), 1, + platform::errors::InvalidArgument( + "The size of Beta1 power accumulator should be greater " + "than 0, but received %d.", + phi::product(beta1_pow_dims))); + auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow"); + VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]"; + PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), 1, + platform::errors::InvalidArgument( + "The size of Beta2 power accumulator should be greater " + "than 0, but received %d.", + phi::product(beta2_pow_dims))); + + auto param_dims = ctx->GetInputDim("Param"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Grad"), + platform::errors::InvalidArgument( + "Param and Grad input of AdamOp should have same dimension. But " + "received Param dims: [%s], Grad dims: [%s].", + param_dims, ctx->GetInputDim("Grad"))); } + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment1"), + platform::errors::InvalidArgument( + "Param and Moment1 input of AdamOp should have same dimension. But " + "received Param dims: [%s], Moment1 dims: [%s].", + param_dims, ctx->GetInputDim("Moment1"))); + PADDLE_ENFORCE_EQ( + param_dims, ctx->GetInputDim("Moment2"), + platform::errors::InvalidArgument( + "Param and Moment2 input of AdamOp should have same dimension. 
But " + "received Param dims: [%s], Moment2 dims: [%s].", + param_dims, ctx->GetInputDim("Moment2"))); + + ctx->SetOutputDim("ParamOut", param_dims); + ctx->SetOutputDim("Moment1Out", param_dims); + ctx->SetOutputDim("Moment2Out", param_dims); + ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims); + ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims); +} - framework::OpKernelType GetKernelTypeForVar( - const std::string &var_name, const framework::Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const { - if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || - var_name == "SkipUpdate") { - return expected_kernel_type; - } else { - return framework::OpKernelType(expected_kernel_type.data_type_, - tensor.place(), tensor.layout()); - } +framework::OpKernelType AdamOp::GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); +} + +framework::OpKernelType AdamOp::GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (var_name == "Beta1Pow" || var_name == "Beta2Pow" || + var_name == "SkipUpdate") { + return expected_kernel_type; + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); } -}; +} class AdamOpMaker : public framework::OpProtoAndCheckerMaker { public: @@ -148,10 +232,6 @@ $$ } }; -class AdamWOp : public AdamOp { - using AdamOp::AdamOp; -}; - class AdamWOpMaker : public AdamOpMaker { public: void Make() { @@ -175,23 +255,13 @@ class AdamWOpMaker : public AdamOpMaker { } // namespace paddle namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); + +REGISTER_OP_WITHOUT_GRADIENT(adamw, ops::AdamWOp, ops::AdamWOpMaker); -DECLARE_INFER_SHAPE_FUNCTOR(adam, AdamInferMetaFunctor, - PD_INFER_META(phi::AdamInferMeta)); - -REGISTER_OPERATOR( - adam, ops::AdamOp, ops::AdamOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - AdamInferMetaFunctor); - -DECLARE_INFER_SHAPE_FUNCTOR(adamw, AdamwInferMetaFunctor, - PD_INFER_META(phi::AdamwInferMeta)); -REGISTER_OPERATOR( - adamw, ops::AdamWOp, ops::AdamWOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker, - AdamwInferMetaFunctor); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); REGISTER_OP_VERSION(adam) .AddCheckpoint( diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..c1aa392d8a528d248d07fb9654e45e3006e79139 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -0,0 +1,420 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +__global__ void AdamKernelREG(MT beta1, MT beta2, MT epsilon, MT beta1_pow_, + MT beta2_pow_, const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, const MT* lr_, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon, + const MT* beta1_pow_, const MT* beta2_pow_, + const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, const MT* lr_, + const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, + int ndim) { + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} +template +__global__ void UpdateBetaPow(T beta1, T beta2, const T* beta1_pow_, + const T* beta2_pow_, T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamCUDAKernelREG( + MT beta1, MT beta2, MT epsilon, const MT beta1_pow, const MT beta2_pow, + const MT* mom1_, MT* mom1_out_, const MT* mom2_, MT* mom2_out_, + const MT* lr_, const T* grad_, const T* param_, T* param_out_, + const MT* master_param, MT* master_param_out, const int64_t* rows_, + int64_t row_numel, int64_t row_count, bool lazy_mode, int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_; + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = mom1_[id]; + MT mom2 = mom2_[id]; + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? 
static_cast(grad_[row_idx * row_numel + id % row_numel]) + : static_cast(0); + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = + (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + // Write back to global memory + mom1_out_[id] = mom1; + mom2_out_[id] = mom2; + param_out_[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } + } +} + +template +class AdamOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + using paddle::framework::LoDTensor; + using MPDType = typename details::MPTypeTrait::Type; + + int64_t min_row_size_to_use_multithread = + ctx.Attr("min_row_size_to_use_multithread"); + bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adam skip update"; + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); + framework::TensorCopy( + *beta1_pow, beta1_pow->place(), + ctx.template device_context(), + beta1_pow_out); + framework::TensorCopy( + *beta2_pow, beta2_pow->place(), + ctx.template device_context(), + beta2_pow_out); + return; + } + + MPDType beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + MPDType beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + 
"Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + const bool multi_precision = ctx.Attr("multi_precision"); + const LoDTensor* master_param = nullptr; + LoDTensor* master_param_out = nullptr; + if (multi_precision) { + bool has_master = + ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ(has_master, true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + master_param = ctx.Input("MasterParam"); + master_param_out = ctx.Output("MasterParamOut"); + } + const MPDType* master_in_data = + multi_precision ? master_param->data() : nullptr; + MPDType* master_out_data = + multi_precision + ? master_param_out->mutable_data(ctx.GetPlace()) + : nullptr; + + auto& dev_ctx = ctx.template device_context(); + + if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); + + // update param and moment + int threads = 512; + int blocks = (param->numel() + threads - 1) / threads; + + if (beta1_pow->place() == platform::CPUPlace() && + beta2_pow->place() == platform::CPUPlace()) { + // Compute with betapow in REG + AdamKernelREG<<>>( + beta1, beta2, epsilon, *beta1_pow->data(), + *beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad->data(), param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param->numel()); + if (!use_global_beta_pow) { + // Cpu update + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } + } else { + AdamKernelMEM<<>>( + beta1, beta2, epsilon, beta1_pow->data(), + beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad->data(), param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, param->numel()); + if (!use_global_beta_pow) { + // Update with gpu + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } + } + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); + if (grad->rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad->rows().begin(), grad->rows().end()); + 
bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + phi::SelectedRows tmp_grad_merge; + const phi::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = grad; + } else { + // merge duplicated rows if any. + // The rows of grad_merge have been sorted inside MergeAdd functor + scatter::MergeAdd merge_func; + merge_func(ctx.template device_context(), + *grad, &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; + } + auto& grad_merge = *grad_merge_ptr; + auto& grad_tensor = grad_merge.value(); + const T* grad_data = grad_tensor.template data(); + auto* grad_merge_rows = &grad_merge.rows(); + paddle::framework::MixVector mixv_grad_merge_rows( + grad_merge_rows); + const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace()); + auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); + + if (beta1_pow->place() == platform::CPUPlace() && + beta2_pow->place() == platform::CPUPlace()) { + int threads = 512; + int ndim = param->numel(); + int blocks = (ndim + threads - 1) / threads; + + SparseAdamCUDAKernelREG< + T, MPDType><<>>( + beta1, beta2, epsilon, *beta1_pow->data(), + *beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad_data, param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, rows, row_numel, grad_merge.rows().size(), + lazy_mode, ndim); + if (!use_global_beta_pow) { + // Update with cpu + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow->data()[0]; + } + } else { + SparseAdamFunctor functor( + beta1, beta2, epsilon, beta1_pow->data(), + beta2_pow->data(), mom1->data(), + mom1_out->mutable_data(ctx.GetPlace()), + mom2->data(), + mom2_out->mutable_data(ctx.GetPlace()), + lr->data(), grad_data, param->data(), + param_out->mutable_data(ctx.GetPlace()), master_in_data, + master_out_data, rows, row_numel, grad_merge.rows().size(), + lazy_mode); + + // FIXME(minqiyang): remove BinarySearch in GPU later + platform::ForRange for_range( + static_cast( + ctx.device_context()), + param->numel()); + for_range(functor); + if (!use_global_beta_pow) { + // update beta1 and beta2 + UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( + beta1, beta2, beta1_pow->data(), + beta2_pow->data(), + beta1_pow_out->mutable_data(ctx.GetPlace()), + beta2_pow_out->mutable_data(ctx.GetPlace())); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type not supported by adam_op")); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(adam, ops::AdamOpCUDAKernel, + ops::AdamOpCUDAKernel, + ops::AdamOpCUDAKernel); diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h new file mode 100644 index 0000000000000000000000000000000000000000..decab04f1ca261a828dd749cefbdbaf9f5cfac79 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -0,0 +1,695 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include // for sqrt in CPU and CUDA +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/kernels/funcs/algorithm.h" + +namespace paddle { +namespace operators { + +namespace scatter = paddle::operators::math::scatter; + +static inline float GetAttrFromTensor(const framework::Tensor* tensor) { + const float* tensor_data = tensor->data(); + framework::Tensor cpu_tensor; + if (platform::is_gpu_place(tensor->place())) { + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + if (platform::is_xpu_place(tensor->place())) { + paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), + &cpu_tensor); + tensor_data = cpu_tensor.data(); + } + return tensor_data[0]; +} + +class AdamOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override; +}; + +struct GPUAdam; +struct CPUAdam; + +template +class AdamFunctor; + +template +class AdamFunctor { + private: + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + public: + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + inline HOSTDEVICE void operator()(size_t i) const { + // Merge all memory access together. 
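+    // A sketch of the algebra used below (standard bias-corrected Adam):
+    //   mom1 = beta1 * mom1 + (1 - beta1) * g
+    //   mom2 = beta2 * mom2 + (1 - beta2) * g * g
+    //   p   -= lr * (mom1 / (1 - beta1_pow)) /
+    //          (sqrt(mom2 / (1 - beta2_pow)) + epsilon)
+    // The bias corrections are folded into the scalars instead:
+    //   lr_t = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow)
+    //   p   -= lr_t * mom1 / (sqrt(mom2) + epsilon * sqrt(1 - beta2_pow))
+    // which is why epsilon_ appears scaled by sqrt(1 - beta2_pow) below.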
+ T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } +}; + +template +class AdamFunctor { + private: + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + public: + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out) {} + + void operator()(size_t numel) const { + Eigen::Map> g{ + grad_, static_cast(numel)}; + Eigen::Map> mom1{ + moment1_, static_cast(numel)}; + Eigen::Map> mom2{ + moment2_, static_cast(numel)}; + Eigen::Map> param{ + param_, static_cast(numel)}; + + Eigen::Map> param_out{ + param_out_, static_cast(numel)}; + Eigen::Map> moment1_out{ + moment1_out_, static_cast(numel)}; + Eigen::Map> moment2_out{ + moment2_out_, static_cast(numel)}; + + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + + moment1_out = beta1_ * mom1 + (1 - beta1_) * g; + moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g; + param_out = param - + lr * (moment1_out / + (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow))); + } +}; + +template +class SparseAdamFunctor; + +template +class SparseAdamFunctor { + private: + MT beta1_; + MT beta2_; + MT epsilon_; + + const MT* beta1_pow_; + const MT* beta2_pow_; + const MT* moment1_; + MT* moment1_out_; + const MT* moment2_; + MT* moment2_out_; + const MT* lr_; + const T* grad_; + const T* param_; + T* param_out_; + const MT* master_param_; + MT* master_param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + bool lazy_mode_; + + public: + SparseAdamFunctor(MT beta1, MT beta2, MT epsilon, const MT* beta1_pow, + const MT* beta2_pow, const MT* mom1, MT* mom1_out, + const MT* mom2, MT* mom2_out, const MT* lr, const T* grad, + const T* param, T* param_out, const MT* master_param, + MT* master_param_out, const int64_t* rows, + int64_t row_numel, int64_t row_count, bool lazy_mode) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + master_param_(master_param), + master_param_out_(master_param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count), + lazy_mode_(lazy_mode) {} + + inline HOSTDEVICE void adam_update(size_t i, MT g) const { + // The following code is the same as dense + MT mom1 = moment1_[i]; + MT mom2 = moment2_[i]; + MT lr = *lr_; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + MT p = 
master_param_ ? master_param_[i] : static_cast(param_[i]); + + // Calculation + lr *= sqrt(static_cast(1.0) - beta2_pow) / + (static_cast(1.0) - beta1_pow); + + mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; + mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + + epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = static_cast(p); + if (master_param_out_) { + master_param_out_[i] = p; + } + } + + inline HOSTDEVICE void operator()(size_t i) const { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); + if (lazy_mode_ && row_idx < 0) { + return; + } else { + MT g = row_idx >= 0 + ? static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) + : static_cast(0); + adam_update(i, g); + } + } +}; + +template +class SparseAdamFunctor { + private: + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + T* param_out_; + + const int64_t* rows_; + int64_t row_numel_; + int64_t row_count_; + + public: + SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, + const T* mom2, T* mom2_out, const T* lr, const T* grad, + const T* param, T* param_out, const int64_t* rows, + int64_t row_numel, int64_t row_count, bool lazy_mode) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param), + param_out_(param_out), + rows_(rows), + row_numel_(row_numel), + row_count_(row_count) {} + + inline HOSTDEVICE void adam_update(size_t i, T g) const { + // The following code is the same as dense + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + T p = param_[i]; + + // Calculation + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + param_out_[i] = p; + } + + inline void operator()(size_t numel) const { + // lr could be reuse + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); + int64_t row_count = static_cast(numel / row_numel_); + + for (int64_t i = 0, j = 0; i != row_count; ++i) { + if (i == *(rows_ + j)) { + for (int64_t k = 0; k != row_numel_; ++k) { + T g = grad_[j * row_numel_ + k]; + adam_update(i * row_numel_ + k, g); + } + ++j; + } else { + for (int64_t k = 0; k != row_numel_; ++k) { + T mom1 = moment1_[i * row_numel_ + k]; + T mom2 = moment2_[i * row_numel_ + k]; + T p = param_[i * row_numel_ + k]; + + mom1 = beta1_ * mom1; + mom2 = beta2_ * mom2; + + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); + // Write back to global memory + moment1_out_[i * row_numel_ + k] = mom1; + moment2_out_[i * row_numel_ + k] = mom2; + param_out_[i * row_numel_ + k] = p; + } + } + } + } +}; + +template +class AdamOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* param_var = ctx.InputVar("Param"); + 
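+    // Roadmap for this CPU kernel: dense LoDTensor gradients are handled by
+    // the JIT-generated adam kernel over 512-element chunks (parallelized
+    // with OpenMP when MKLML is available); SelectedRows gradients go through
+    // SparseAdamFunctor with the CPUAdam tag -- lazily over touched rows,
+    // across FLAGS_inner_op_parallelism threads, or serially over all
+    // elements.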
PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + using paddle::framework::LoDTensor; + + int64_t min_row_size_to_use_multithread = + ctx.Attr("min_row_size_to_use_multithread"); + bool lazy_mode = ctx.Attr("lazy_mode"); + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + auto* param = ctx.Input("Param"); + auto* grad_var = ctx.InputVar("Grad"); + auto* mom1 = ctx.Input("Moment1"); + auto* mom2 = ctx.Input("Moment2"); + auto* lr = ctx.Input("LearningRate"); + auto* beta1_pow = ctx.Input("Beta1Pow"); + auto* beta2_pow = ctx.Input("Beta2Pow"); + + auto* param_out = ctx.Output("ParamOut"); + auto* mom1_out = ctx.Output("Moment1Out"); + auto* mom2_out = ctx.Output("Moment2Out"); + auto* beta1_pow_out = ctx.Output("Beta1PowOut"); + auto* beta2_pow_out = ctx.Output("Beta2PowOut"); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector(*skip_update_tensor, + ctx.device_context(), &skip_update_vec); + skip_update = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + if (skip_update) { + VLOG(4) << "Adam skip update"; + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); + framework::TensorCopy( + *beta1_pow, ctx.GetPlace(), + ctx.template device_context(), + beta1_pow_out); + framework::TensorCopy( + *beta2_pow, ctx.GetPlace(), + ctx.template device_context(), + beta2_pow_out); + return; + } + + T beta1 = static_cast(ctx.Attr("beta1")); + if (ctx.HasInput("Beta1Tensor")) { + auto* beta1_tensor = ctx.Input("Beta1Tensor"); + PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta1Tensor) size must be 1, but get %d", + beta1_tensor->numel())); + beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } + T beta2 = static_cast(ctx.Attr("beta2")); + if (ctx.HasInput("Beta2Tensor")) { + auto* beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(Beta2Tensor) size must be 1, but get %d", + beta2_tensor->numel())); + beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } + T epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } + + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() + << "beta2_pow.numel() : " << beta2_pow->numel(); + VLOG(3) << "param.numel(): " << param->numel(); + + PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value 
is:%d.", + beta1_pow_out->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_out->numel())); + + if (grad_var->IsType()) { + T beta1_p = beta1_pow->data()[0]; + T beta2_p = beta2_pow->data()[0]; + + if (!use_global_beta_pow) { + beta1_pow_out->mutable_data(ctx.GetPlace())[0] = + beta1 * beta1_pow->data()[0]; + beta2_pow_out->mutable_data(ctx.GetPlace())[0] = + beta2 * beta2_pow->data()[0]; + } + + auto* grad = ctx.Input("Grad"); + + T* param_out_ptr = param_out->mutable_data(ctx.GetPlace()); + T* mom1_out_ptr = mom1_out->mutable_data(ctx.GetPlace()); + T* mom2_out_ptr = mom2_out->mutable_data(ctx.GetPlace()); + + T learning_rate = lr->data()[0] * (sqrt(1 - beta2_p) / (1 - beta1_p)); + T eps = epsilon * sqrt(1 - beta2_p); + + jit::adam_attr_t attr(beta1, beta2); + int64_t numel = param->numel(); + + const T* param_ptr = param->data(); + const T* mom1_ptr = mom1->data(); + const T* mom2_ptr = mom2->data(); + const T* grad_ptr = grad->data(); + + auto adam = + jit::KernelFuncs, platform::CPUPlace>::Cache().At( + attr); + + static constexpr int64_t chunk_size = 512; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < numel / chunk_size; ++i) { + const int64_t offset = i * chunk_size; + adam(beta1, beta2, -learning_rate, eps, chunk_size, grad_ptr + offset, + mom1_ptr + offset, mom2_ptr + offset, param_ptr + offset, + mom1_out_ptr + offset, mom2_out_ptr + offset, + param_out_ptr + offset); + } + + if (numel % chunk_size != 0) { + const int64_t offset = (numel / chunk_size) * chunk_size; + const int64_t tail_numel = numel % chunk_size; + adam(beta1, beta2, -learning_rate, eps, tail_numel, grad_ptr + offset, + mom1_ptr + offset, mom2_ptr + offset, param_ptr + offset, + mom1_out_ptr + offset, mom2_out_ptr + offset, + param_out_ptr + offset); + } + } else if (grad_var->IsType()) { + auto* grad = ctx.Input("Grad"); + if (grad->rows().size() == 0) { + VLOG(3) << "grad row size is 0!!"; + return; + } + + std::vector cpu_rows(grad->rows().begin(), grad->rows().end()); + bool is_strict_sorted = true; + for (size_t i = 1; i < cpu_rows.size(); ++i) { + if (cpu_rows[i - 1] >= cpu_rows[i]) { + is_strict_sorted = false; + break; + } + } + + phi::SelectedRows tmp_grad_merge; + const phi::SelectedRows* grad_merge_ptr; + if (is_strict_sorted) { + grad_merge_ptr = grad; + } else { + // merge duplicated rows if any. 
+        // The rows of grad_merge have been sorted inside MergeAdd functor
+        scatter::MergeAdd<DeviceContext, T> merge_func;
+        merge_func(ctx.template device_context<DeviceContext>(), *grad,
+                   &tmp_grad_merge, true);
+        grad_merge_ptr = &tmp_grad_merge;
+      }
+
+      auto& grad_merge = *grad_merge_ptr;
+      auto& grad_tensor = grad_merge.value();
+      const T* grad_data = grad_tensor.template data<T>();
+      auto* grad_merge_rows = &grad_merge.rows();
+      paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
+          grad_merge_rows);
+      const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
+      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+
+      SparseAdamFunctor<T, CPUAdam> functor(
+          beta1, beta2, epsilon, beta1_pow->data<T>(), beta2_pow->data<T>(),
+          mom1->data<T>(), mom1_out->mutable_data<T>(ctx.GetPlace()),
+          mom2->data<T>(), mom2_out->mutable_data<T>(ctx.GetPlace()),
+          lr->data<T>(), grad_data, param->data<T>(),
+          param_out->mutable_data<T>(ctx.GetPlace()), rows, row_numel,
+          grad_merge.rows().size(), lazy_mode);
+      // update beta1 and beta2
+      if (!use_global_beta_pow) {
+        beta1_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+            beta1 * beta1_pow->data<T>()[0];
+        beta2_pow_out->mutable_data<T>(ctx.GetPlace())[0] =
+            beta2 * beta2_pow->data<T>()[0];
+      }
+      if (lazy_mode) {
+        VLOG(3) << "run cpu lazy mode";
+        size_t row_count = grad_merge.rows().size();
+        std::vector<int64_t> cpu_rows(grad_merge.rows());
+        for (size_t row_index = 0; row_index < row_count; ++row_index) {
+          for (size_t offset = 0; offset < row_numel; ++offset) {
+            size_t i = cpu_rows[row_index] * row_numel + offset;
+            functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+          }
+        }
+      }
+#ifndef _WIN32
+      else if (FLAGS_inner_op_parallelism > 1 &&  // NOLINT
+               min_row_size_to_use_multithread > 0 &&
+               param->dims()[0] > min_row_size_to_use_multithread) {
+        VLOG(3) << "use multi thread, inner_op_parallelism="
+                << FLAGS_inner_op_parallelism
+                << " min_row_size_to_use_multithread="
+                << min_row_size_to_use_multithread;
+        if (FLAGS_inner_op_parallelism > 10) {
+          VLOG(1) << "FLAGS_inner_op_parallelism " << FLAGS_inner_op_parallelism
+                  << " is too large!";
+        }
+        auto& grad_rows = grad_merge.rows();
+        std::unordered_map<int64_t, int64_t> row_id_to_grad_row_offset;
+        size_t param_row_count = param->numel() / row_numel;
+        if (param_row_count < 1000) {
+          VLOG(1) << "param_row_count should be larger than 1000 to use "
+                     "multi thread, currently "
+                  << param_row_count;
+        }
+        for (size_t i = 0; i < grad_rows.size(); ++i) {
+          row_id_to_grad_row_offset[grad_rows[i]] = i;
+        }
+        std::vector<std::future<void>> fs;
+        int64_t line_in_each_thread =
+            param_row_count / FLAGS_inner_op_parallelism + 1;
+        for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
+          int64_t start = i * line_in_each_thread;
+          int64_t end = (i + 1) * line_in_each_thread;
+          if (start >= static_cast<int64_t>(param_row_count)) {
+            break;
+          }
+          if (end > static_cast<int64_t>(param_row_count)) {
+            end = static_cast<int64_t>(param_row_count);
+          }
+          fs.push_back(framework::Async([&functor, &row_id_to_grad_row_offset,
+                                         &grad_data, row_numel, start, end]() {
+            for (int64_t row_id = start; row_id < end; ++row_id) {
+              auto iter = row_id_to_grad_row_offset.find(row_id);
+              if (iter != row_id_to_grad_row_offset.end()) {
+                for (size_t row_offset = 0U; row_offset < row_numel;
+                     ++row_offset) {
+                  functor.adam_update(
+                      row_id * row_numel + row_offset,
+                      grad_data[iter->second * row_numel + row_offset]);
+                }
+              } else {
+                for (size_t row_offset = 0U; row_offset < row_numel;
+                     ++row_offset) {
+                  functor.adam_update(row_id * row_numel + row_offset, 0);
+                }
+              }
+            }
+          }));
+        }
+        for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
+      }
+#endif  // !_WIN32
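+      // Serial fallback: the functor sweeps every parameter row once. Rows
+      // that have a gradient receive the full adam_update; rows without one
+      // still have mom1/mom2 decayed by beta1/beta2 and the parameter moved
+      // by -lr * mom1 / (sqrt(mom2) + epsilon), matching lazy_mode=false
+      // semantics.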
+ else { // NOLINT + functor(param->numel()); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type not supported by adam_op")); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op_functor.h b/paddle/fluid/operators/optimizers/adam_op_functor.h deleted file mode 100644 index e2c1c9abd6dcb0d8447e4d5036acd38913f4e526..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/optimizers/adam_op_functor.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" - -namespace paddle { -namespace operators { - -namespace scatter = paddle::operators::math::scatter; - -static inline float GetAttrFromTensor(const framework::Tensor* tensor) { - const float* tensor_data = tensor->data(); - framework::Tensor cpu_tensor; - if (platform::is_gpu_place(tensor->place())) { - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), - &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - if (platform::is_xpu_place(tensor->place())) { - paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(), - &cpu_tensor); - tensor_data = cpu_tensor.data(); - } - return tensor_data[0]; -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 1ea91f6ebfa3e59ca2a6af2fea15a24b8a32e4e0..56c5d48b9f497f12b4cebea69b5e05957caf7891 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -15,8 +15,8 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 6ea0b2054cdea6a11b29e0d1f8c37a7472bb55ec..6c47b3906e062eabdd65f724b3ec2e6f9d38d4e7 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/operators/optimizers/adam_op.h" #include "gflags/gflags.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/optimizers/adam_op_functor.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adamw_op.cc b/paddle/fluid/operators/optimizers/adamw_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c2111d53f3a45fedec31ff1abcd9263a26145b98 --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +namespace ops = paddle::operators; +REGISTER_OP_CPU_KERNEL( + adamw, ops::AdamWOpKernel, + ops::AdamWOpKernel); diff --git a/paddle/fluid/operators/optimizers/adamw_op.cu b/paddle/fluid/operators/optimizers/adamw_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..1d61bdec26d581278758f39293e600598624435f --- /dev/null +++ b/paddle/fluid/operators/optimizers/adamw_op.cu @@ -0,0 +1,443 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/optimizers/adamw_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +__global__ void AdamWKernelREG(MT beta1, MT beta2, MT epsilon, MT coeff, + MT lr_ratio, MT beta1_pow_, MT beta2_pow_, + const MT* moment1, MT* moment1_out, + const MT* moment2, MT* moment2_out, + const MT* lr_, const T* grad, const T* param, + T* param_out, const MT* master_param, + MT* master_param_out, int ndim) { + MT lr = *lr_ * lr_ratio; + MT beta1_pow = beta1_pow_; + MT beta2_pow = beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? 
master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + + p *= (static_cast(1.0) - lr * coeff); + + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} + +template +__global__ void AdamWKernelMEM( + MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio, const MT* beta1_pow_, + const MT* beta2_pow_, const MT* moment1, MT* moment1_out, const MT* moment2, + MT* moment2_out, const MT* lr_, const T* grad, const T* param, T* param_out, + const MT* master_param, MT* master_param_out, int ndim) { + MT lr = *lr_ * lr_ratio; + MT beta1_pow = *beta1_pow_; + MT beta2_pow = *beta2_pow_; + + int id = blockIdx.x * blockDim.x + threadIdx.x; + + for (; id < ndim; id += gridDim.x * blockDim.x) { + MT p = master_param ? master_param[id] : static_cast(param[id]); + MT g = static_cast(grad[id]); + MT mom1 = static_cast(moment1[id]); + MT mom2 = static_cast(moment2[id]); + + p *= (static_cast(1.0) - lr * coeff); + + mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; + mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; + + MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; + + p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); + + moment1_out[id] = mom1; + moment2_out[id] = mom2; + param_out[id] = static_cast(p); + if (master_param_out) { + master_param_out[id] = p; + } + } +} +template +__global__ void UpdateAdamWBetaPow(T beta1, T beta2, const T* beta1_pow_, + const T* beta2_pow_, T* beta1_pow_out, + T* beta2_pow_out) { + *beta1_pow_out = beta1 * beta1_pow_[0]; + *beta2_pow_out = beta2 * beta2_pow_[0]; +} + +template +__global__ void SparseAdamWCUDAKernelREG( + MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio, const MT beta1_pow, + const MT beta2_pow, const MT* mom1_, MT* mom1_out_, const MT* mom2_, + MT* mom2_out_, const MT* lr_, const T* grad_, const T* param_, + T* param_out_, const MT* master_param, MT* master_param_out, + const int64_t* rows_, int64_t row_numel, int64_t row_count, bool lazy_mode, + int ndim) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + MT lr = *lr_ * lr_ratio; + + for (; id < ndim; id += blockDim.x * gridDim.x) { + auto row_idx = + phi::funcs::BinarySearch(rows_, row_count, id / row_numel); + if (lazy_mode && row_idx < 0) { + return; + } else { + MT mom1 = static_cast(mom1_[id]); + MT mom2 = static_cast(mom2_[id]); + + MT p = master_param ? master_param[id] : static_cast(param_[id]); + MT g = row_idx >= 0 + ? 
+          static_cast<MT>(grad_[row_idx * row_numel + id % row_numel])
+          : static_cast<MT>(0);
+
+      p *= (static_cast<MT>(1.0) - lr * coeff);
+
+      mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
+      mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
+
+      MT denom =
+          (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
+
+      p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
+
+      // Write back to global memory
+      mom1_out_[id] = mom1;
+      mom2_out_[id] = mom2;
+      param_out_[id] = static_cast<T>(p);
+      if (master_param_out) {
+        master_param_out[id] = p;
+      }
+    }
+  }
+}
+
+template <typename T>
+class AdamWOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));
+
+    using paddle::framework::LoDTensor;
+    using MPDType = typename details::MPTypeTrait<T>::Type;
+
+    int64_t min_row_size_to_use_multithread =
+        ctx.Attr<int64_t>("min_row_size_to_use_multithread");
+    bool lazy_mode = ctx.Attr<bool>("lazy_mode");
+    bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
+    VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
+
+    MPDType coeff = static_cast<MPDType>(ctx.Attr<float>("coeff"));
+    MPDType lr_ratio = static_cast<MPDType>(ctx.Attr<float>("lr_ratio"));
+
+    auto* param = ctx.Input<LoDTensor>("Param");
+    auto* grad_var = ctx.InputVar("Grad");
+    auto* mom1 = ctx.Input<LoDTensor>("Moment1");
+    auto* mom2 = ctx.Input<LoDTensor>("Moment2");
+    auto* lr = ctx.Input<LoDTensor>("LearningRate");
+
+    auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
+    auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
+
+    auto* param_out = ctx.Output<LoDTensor>("ParamOut");
+    auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
+    auto* mom2_out = ctx.Output<LoDTensor>("Moment2Out");
+    auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
+    auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
+
+    bool skip_update = false;
+    if (ctx.HasInput("SkipUpdate")) {
+      auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
+      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(SkipUpdate) size must be 1, but get %d",
+                            skip_update_tensor->numel()));
+      std::vector<bool> skip_update_vec;
+      paddle::framework::TensorToVector(*skip_update_tensor,
+                                        ctx.device_context(),
+                                        &skip_update_vec);
+      skip_update = skip_update_vec[0];
+    }
+
+    // skip_update=true, just copy input to output, and TensorCopy will call
+    // mutable_data
+    if (skip_update) {
+      VLOG(4) << "Adamw skip update";
+      framework::TensorCopy(
+          *param, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), param_out);
+      framework::TensorCopy(
+          *mom1, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), mom1_out);
+      framework::TensorCopy(
+          *mom2, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(), mom2_out);
+      framework::TensorCopy(
+          *beta1_pow, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(),
+          beta1_pow_out);
+      framework::TensorCopy(
+          *beta2_pow, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(),
+          beta2_pow_out);
+      return;
+    }
+
+    // if with_decay = false, coeff = 0
+    bool with_decay = ctx.Attr<bool>("with_decay");
+    if (!with_decay) {
+      coeff = static_cast<MPDType>(0.0);
+    }
+
+    MPDType beta1 = static_cast<MPDType>(ctx.Attr<float>("beta1"));
+    if (ctx.HasInput("Beta1Tensor")) {
+      auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
+      PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(Beta1Tensor) size must be 1, but get %d",
+                            beta1_tensor->numel()));
+      beta1 = static_cast<MPDType>(GetAttrFromTensor(beta1_tensor));
+    }
+    MPDType beta2 = static_cast<MPDType>(ctx.Attr<float>("beta2"));
+    if (ctx.HasInput("Beta2Tensor")) {
+      auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
+      PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(Beta2Tensor) size must be 1, but get %d",
+                            beta2_tensor->numel()));
+      beta2 = static_cast<MPDType>(GetAttrFromTensor(beta2_tensor));
+    }
+    MPDType epsilon = static_cast<MPDType>(ctx.Attr<float>("epsilon"));
+    if (ctx.HasInput("EpsilonTensor")) {
+      auto* epsilon_tensor = ctx.Input<framework::Tensor>("EpsilonTensor");
+      PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(EpsilonTensor) size must be 1, but get %d",
+                            epsilon_tensor->numel()));
+      epsilon = static_cast<MPDType>(GetAttrFromTensor(epsilon_tensor));
+    }
+    VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
+            << "beta2_pow.numel() : " << beta2_pow->numel();
+    VLOG(3) << "param.numel(): " << param->numel();
+    PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "beta1 pow output size should be 1, but received "
+                          "value is:%d.",
+                          beta1_pow_out->numel()));
+
+    PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "beta2 pow output size should be 1, but received "
+                          "value is:%d.",
+                          beta2_pow_out->numel()));
+
+    const bool multi_precision = ctx.Attr<bool>("multi_precision");
+    const LoDTensor* master_param = nullptr;
+    LoDTensor* master_param_out = nullptr;
+    if (multi_precision) {
+      bool has_master =
+          ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
+      PADDLE_ENFORCE_EQ(has_master, true,
+                        platform::errors::InvalidArgument(
+                            "The Input(MasterParam) and Output(MasterParamOut) "
+                            "should not be null when "
+                            "the attr `multi_precision` is true"));
+      master_param = ctx.Input<LoDTensor>("MasterParam");
+      master_param_out = ctx.Output<LoDTensor>("MasterParamOut");
+    }
+    const MPDType* master_in_data =
+        multi_precision ? master_param->data<MPDType>() : nullptr;
+    MPDType* master_out_data =
+        multi_precision
+            ? master_param_out->mutable_data<MPDType>(ctx.GetPlace())
+            : nullptr;
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto* grad = ctx.Input<LoDTensor>("Grad");
+
+      // update param and moment
+      int threads = 512;
+      int blocks = (param->numel() + threads - 1) / threads;
+
+      if (beta1_pow->place() == platform::CPUPlace() &&
+          beta2_pow->place() == platform::CPUPlace()) {
+        // Compute with betapow in REG
+        AdamWKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
+            beta1, beta2, epsilon, coeff, lr_ratio,
+            *beta1_pow->data<MPDType>(), *beta2_pow->data<MPDType>(),
+            mom1->data<MPDType>(),
+            mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
+            mom2->data<MPDType>(),
+            mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
+            lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
+            param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
+            master_out_data, param->numel());
+        if (!use_global_beta_pow) {
+          // Cpu update
+          beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta1 * beta1_pow->data<MPDType>()[0];
+          beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta2 * beta2_pow->data<MPDType>()[0];
+        }
+      } else {
+        AdamWKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
+            beta1, beta2, epsilon, coeff, lr_ratio, beta1_pow->data<MPDType>(),
+            beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
+            mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
+            mom2->data<MPDType>(),
+            mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
+            lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
+            param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
+            master_out_data, param->numel());
+        if (!use_global_beta_pow) {
+          // Update with gpu
+          UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
+              beta1, beta2, beta1_pow->data<MPDType>(),
+              beta2_pow->data<MPDType>(),
+              beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
+              beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
+        }
+      }
+    } else if (grad_var->IsType<phi::SelectedRows>()) {
+      auto* grad = ctx.Input<phi::SelectedRows>("Grad");
+      if (grad->rows().size() == 0) {
+        VLOG(3) << "grad row size is 0!!";
+        return;
+      }
+
+      std::vector<int64_t> cpu_rows(grad->rows().begin(), grad->rows().end());
+      bool is_strict_sorted = true;
+      for (size_t i = 1; i < cpu_rows.size(); ++i) {
+        if (cpu_rows[i - 1] >= cpu_rows[i]) {
+          is_strict_sorted = false;
+          break;
+        }
+      }
+
+      phi::SelectedRows tmp_grad_merge;
+      const phi::SelectedRows* grad_merge_ptr;
+      if (is_strict_sorted) {
+        grad_merge_ptr = grad;
+      } else {
+        // merge duplicated rows if any.
+        // The rows of grad_merge have been sorted inside MergeAdd functor
+        scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
+        merge_func(ctx.template device_context<platform::CUDADeviceContext>(),
+                   *grad, &tmp_grad_merge, true);
+        grad_merge_ptr = &tmp_grad_merge;
+      }
+      auto& grad_merge = *grad_merge_ptr;
+      auto& grad_tensor = grad_merge.value();
+      const T* grad_data = grad_tensor.template data<T>();
+      auto* grad_merge_rows = &grad_merge.rows();
+      paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
+          grad_merge_rows);
+      const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
+      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+
+      if (beta1_pow->place() == platform::CPUPlace() &&
+          beta2_pow->place() == platform::CPUPlace()) {
+        int threads = 512;
+        int ndim = param->numel();
+        int blocks = (ndim + threads - 1) / threads;
+
+        SparseAdamWCUDAKernelREG<
+            T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
+            beta1, beta2, epsilon, coeff, lr_ratio,
+            *beta1_pow->data<MPDType>(), *beta2_pow->data<MPDType>(),
+            mom1->data<MPDType>(),
+            mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
+            mom2->data<MPDType>(),
+            mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
+            lr->data<MPDType>(), grad_data, param->data<T>(),
+            param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
+            master_out_data, rows, row_numel, grad_merge.rows().size(),
+            lazy_mode, ndim);
+        if (!use_global_beta_pow) {
+          // Update with cpu
+          beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta1 * beta1_pow->data<MPDType>()[0];
+          beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
+              beta2 * beta2_pow->data<MPDType>()[0];
+        }
+      } else {
+        SparseAdamWFunctor<T, GPUAdamW, MPDType> functor(
+            beta1, beta2, epsilon, coeff, lr_ratio, beta1_pow->data<MPDType>(),
+            beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
+            mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
+            mom2->data<MPDType>(),
+            mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
+            lr->data<MPDType>(), grad_data, param->data<T>(),
+            param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
+            master_out_data, rows, row_numel, grad_merge.rows().size(),
+            lazy_mode);
+
+        // FIXME(minqiyang): remove BinarySearch in GPU later
+        platform::ForRange<platform::CUDADeviceContext> for_range(
+            static_cast<const platform::CUDADeviceContext&>(
+                ctx.device_context()),
+            param->numel());
+        for_range(functor);
+        if (!use_global_beta_pow) {
+          // update beta1 and beta2
+          UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
+              beta1, beta2, beta1_pow->data<MPDType>(),
+              beta2_pow->data<MPDType>(),
+              beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
+              beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
+        }
+      }
+    } else {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Variable type not supported by adamw_op"));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(adamw, ops::AdamWOpCUDAKernel<float>,
+                        ops::AdamWOpCUDAKernel<double>,
+                        ops::AdamWOpCUDAKernel<plat::float16>);
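Note: all of the AdamW kernels registered above implement the same per-element update, differing only in where the gradient and the beta-pow accumulators live. For reference, a minimal scalar sketch of that arithmetic in plain C++ (the function name and free-standing form are illustrative, not part of this patch):

    #include <cmath>

    // One AdamW step for a single element: decoupled weight decay applied to
    // the parameter first, then a bias-corrected Adam update.
    inline void adamw_step(float* p, float* mom1, float* mom2, float g,
                           float lr, float beta1, float beta2, float epsilon,
                           float coeff, float beta1_pow, float beta2_pow) {
      *p *= 1.0f - lr * coeff;  // AdamW: decay the weight directly, not via g
      *mom1 = beta1 * *mom1 + (1.0f - beta1) * g;
      *mom2 = beta2 * *mom2 + (1.0f - beta2) * g * g;
      float denom = std::sqrt(*mom2) / std::sqrt(1.0f - beta2_pow) + epsilon;
      *p -= (lr / (1.0f - beta1_pow)) * (*mom1 / denom);
    }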
diff --git a/paddle/fluid/operators/optimizers/adamw_op.h b/paddle/fluid/operators/optimizers/adamw_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..91ab58b0577bd841e0022b078bb1a9de72dbe58e
--- /dev/null
+++ b/paddle/fluid/operators/optimizers/adamw_op.h
@@ -0,0 +1,213 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+
+namespace paddle {
+namespace operators {
+
+class AdamWOp : public AdamOp {
+  using AdamOp::AdamOp;
+};
+
+struct GPUAdamW;
+struct CPUAdamW;
+
+template <typename T, typename Flavour>
+class AdamWFunctor;
+
+template <typename T>
+class AdamWFunctor<T, CPUAdamW> {
+ private:
+  const T coeff_;
+  const T lr_ratio_;
+  const T* lr_;
+  T* param_;
+
+ public:
+  AdamWFunctor(const T coeff, const T lr_ratio, const T* lr, T* param)
+      : coeff_(coeff), lr_ratio_(lr_ratio), lr_(lr), param_(param) {}
+
+  inline HOSTDEVICE void operator()(size_t numel) const {
+    Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> param{
+        param_, static_cast<Eigen::Index>(numel)};
+
+    T lr = *lr_;
+
+    // Calculation
+    param -= lr * lr_ratio_ * coeff_ * param;
+  }
+};
+
+template <typename T, typename Flavour, typename MT = T>
+class SparseAdamWFunctor;
+
+template <typename T, typename MT>
+class SparseAdamWFunctor<T, GPUAdamW, MT> {
+ private:
+  MT beta1_;
+  MT beta2_;
+  MT epsilon_;
+  MT coeff_;
+  MT lr_ratio_;
+
+  const MT* beta1_pow_;
+  const MT* beta2_pow_;
+  const MT* moment1_;
+  MT* moment1_out_;
+  const MT* moment2_;
+  MT* moment2_out_;
+  const MT* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+  const MT* master_param_;
+  MT* master_param_out_;
+
+  const int64_t* rows_;
+  int64_t row_numel_;
+  int64_t row_count_;
+  bool lazy_mode_;
+
+ public:
+  SparseAdamWFunctor(MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio,
+                     const MT* beta1_pow, const MT* beta2_pow, const MT* mom1,
+                     MT* mom1_out, const MT* mom2, MT* mom2_out, const MT* lr,
+                     const T* grad, const T* param, T* param_out,
+                     const MT* master_param, MT* master_param_out,
+                     const int64_t* rows, int64_t row_numel, int64_t row_count,
+                     bool lazy_mode)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        coeff_(coeff),
+        lr_ratio_(lr_ratio),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out),
+        master_param_(master_param),
+        master_param_out_(master_param_out),
+        rows_(rows),
+        row_numel_(row_numel),
+        row_count_(row_count),
+        lazy_mode_(lazy_mode) {}
+
+  inline HOSTDEVICE void adamw_update(size_t i, MT g) const {
+    // The following code is the same as dense
+    MT mom1 = moment1_[i];
+    MT mom2 = moment2_[i];
+    MT lr = *lr_ * lr_ratio_;
+    MT lr_orig = lr;
+    MT beta1_pow = *beta1_pow_;
+    MT beta2_pow = *beta2_pow_;
+    MT p = master_param_ ? master_param_[i] : static_cast<MT>(param_[i]);
+
+    // Calculation
+    lr *= sqrt(static_cast<MT>(1.0) - beta2_pow) /
+          (static_cast<MT>(1.0) - beta1_pow);
+
+    mom1 = beta1_ * mom1 + (static_cast<MT>(1.0) - beta1_) * g;
+    mom2 = beta2_ * mom2 + (static_cast<MT>(1.0) - beta2_) * g * g;
+    p -= lr_orig * coeff_ * p;
+    p -= lr * (mom1 / (sqrt(mom2) +
+                       epsilon_ * sqrt(static_cast<MT>(1.0) - beta2_pow)));
+
+    // Write back to global memory
+    moment1_out_[i] = mom1;
+    moment2_out_[i] = mom2;
+    param_out_[i] = static_cast<T>(p);
+    if (master_param_out_) {
+      master_param_out_[i] = p;
+    }
+  }
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    auto row_idx =
+        phi::funcs::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
+    if (lazy_mode_ && row_idx < 0) {
+      return;
+    } else {
+      MT g = row_idx >= 0
+                 ? static_cast<MT>(grad_[row_idx * row_numel_ + i % row_numel_])
+                 : static_cast<MT>(0);
+      adamw_update(i, g);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AdamWOpKernel : public AdamOpKernel<DeviceContext, T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const auto* param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      platform::errors::InvalidArgument(
+                          "The Var(%s)'s type should be LoDTensor, "
+                          "but the received is %s",
+                          ctx.InputNames("Param").front(),
+                          framework::ToTypeName(param_var->Type())));
+
+    using paddle::framework::LoDTensor;
+    bool skip_update = false;
+    // TODO(liupeng):
+    if (ctx.HasInput("SkipUpdate")) {
+      VLOG(3) << "Has SkipUpdate";
+      auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
+      PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
+                        platform::errors::InvalidArgument(
+                            "Input(SkipUpdate) size must be 1, but get %d",
+                            skip_update_tensor->numel()));
+      std::vector<bool> skip_update_vec;
+      paddle::framework::TensorToVector(*skip_update_tensor,
+                                        ctx.device_context(),
+                                        &skip_update_vec);
+      skip_update = skip_update_vec[0];
+    }
+    VLOG(3) << "Skip update" << skip_update;
+    bool with_decay = ctx.Attr<bool>("with_decay");
+
+    if (skip_update || !with_decay) {
+      AdamOpKernel<DeviceContext, T>::Compute(ctx);
+      return;
+    }
+
+    T coeff = static_cast<T>(ctx.Attr<float>("coeff"));
+    T lr_ratio = static_cast<T>(ctx.Attr<float>("lr_ratio"));
+    auto* lr = ctx.Input<LoDTensor>("LearningRate");
+
+    LoDTensor* param;
+
+    if (ctx.HasInput("MasterParam")) {
+      // TODO(liupeng): master
+      param = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("MasterParam"));
+    } else {
+      param = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Param"));
+    }
+
+    AdamWFunctor<T, CPUAdamW> functor(coeff, lr_ratio, lr->data<T>(),
+                                      param->data<T>());
+    functor(param->numel());
+
+    AdamOpKernel<DeviceContext, T>::Compute(ctx);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc
index d86d2bd2ffb4a0166934c9104983f664efe8b2b4..56fa11d2b08576530f955f27f9577fe3129363fa 100644
--- a/paddle/fluid/operators/optimizers/adamw_op_xpu.cc
+++ b/paddle/fluid/operators/optimizers/adamw_op_xpu.cc
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "gflags/gflags.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/optimizers/adam_op_functor.h"
+#include "paddle/fluid/operators/optimizers/adam_op.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/optimizers/merged_adam_op.h b/paddle/fluid/operators/optimizers/merged_adam_op.h
index 3ad30f54eaa2fa0ca688bb90f947306caf0af7a2..c9417158fe772817b0c50b3eb2f4183a5f094380 100644
--- a/paddle/fluid/operators/optimizers/merged_adam_op.h
+++ b/paddle/fluid/operators/optimizers/merged_adam_op.h
@@ -11,8 +11,7 @@ limitations under the License.
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" +#include "paddle/fluid/operators/optimizers/adam_op.h" namespace paddle { namespace operators { @@ -83,7 +82,7 @@ class MergedAdamOpKernel : public framework::OpKernel { size_t param_num = param.size(); for (size_t idx = 0; idx < param_num; idx++) { - phi::funcs::AdamFunctor functor( + AdamFunctor functor( beta1, beta2, epsilon, beta1_pow[idx]->data(), beta2_pow[idx]->data(), mom1[idx]->data(), mom1_out[idx]->mutable_data(ctx.GetPlace()), mom2[idx]->data(), diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 71f96668a56f3e0cdce00d3b3faa8910eb982cf6..7fb3fc4b1ed012269881f466248478f566aac494 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -32,9 +32,6 @@ ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() { } // namespace platform } // namespace paddle -PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0, - "number of threads for inner op"); - /** * NOTE(paddle-dev): This file is designed to define all public FLAGS. */ diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index f2acfe5a9962be97fd385f322e5136986ad78a28..3ce24139fe18aba96e3b8c0b96bafec77a237d7d 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -92,88 +92,6 @@ void AdagradInferMeta(const MetaTensor& param, moment_out->set_dtype(moment.dtype()); } -void AdamInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment1, - const MetaTensor& moment2, - const MetaTensor& beta1_pow, - const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - MetaTensor* param_out, - MetaTensor* moment1_out, - MetaTensor* moment2_out, - MetaTensor* beta1_pow_out, - MetaTensor* beta2_pow_out, - MetaTensor* master_param_outs) { - auto lr_dims = learning_rate.dims(); - PADDLE_ENFORCE_EQ( - phi::product(lr_dims), - 1, - errors::InvalidArgument( - "The number of LearningRate shall be 1, but received %d. Maybe " - "the Input variable LearningRate has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - phi::product(lr_dims))); - auto beta1_pow_dims = beta1_pow.dims(); - VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]"; - PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), - 1, - errors::InvalidArgument( - "The size of Beta1 power accumulator should be greater " - "than 0, but received %d.", - phi::product(beta1_pow_dims))); - auto beta2_pow_dims = beta2_pow.dims(); - VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]"; - PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), - 1, - errors::InvalidArgument( - "The size of Beta2 power accumulator should be greater " - "than 0, but received %d.", - phi::product(beta2_pow_dims))); - - auto param_dims = param.dims(); - PADDLE_ENFORCE_EQ( - param_dims, - moment1.dims(), - errors::InvalidArgument( - "Param and Moment1 input of AdamOp should have same dimension. 
But " - "received Param dims: [%s], Moment1 dims: [%s].", - param_dims, - moment1.dims())); - PADDLE_ENFORCE_EQ( - param_dims, - moment2.dims(), - errors::InvalidArgument( - "Param and Moment2 input of AdamOp should have same dimension. But " - "received Param dims: [%s], Moment2 dims: [%s].", - param_dims, - moment2.dims())); - - param_out->set_dims(param_dims); - param_out->set_dtype(param.dtype()); - - moment1_out->set_dims(param_dims); - moment1_out->set_dtype(moment1.dtype()); - moment2_out->set_dims(param_dims); - moment2_out->set_dtype(moment2.dtype()); - - beta1_pow_out->set_dims(beta1_pow_dims); - beta1_pow_out->set_dtype(beta1_pow.dtype()); - beta2_pow_out->set_dims(beta2_pow_dims); - beta2_pow_out->set_dtype(beta2_pow.dtype()); -} - void AdamaxInferMeta(const MetaTensor& param, const MetaTensor& grad, const MetaTensor& learning_rate, @@ -230,55 +148,6 @@ void AdamaxInferMeta(const MetaTensor& param, inf_norm_out->set_dtype(inf_norm.dtype()); } -void AdamwInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment1, - const MetaTensor& moment2, - const MetaTensor& beta1_pow, - const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - MetaTensor* param_out, - MetaTensor* moment1_out, - MetaTensor* moment2_out, - MetaTensor* beta1_pow_out, - MetaTensor* beta2_pow_out, - MetaTensor* master_param_outs) { - AdamInferMeta(param, - grad, - learning_rate, - moment1, - moment2, - beta1_pow, - beta2_pow, - master_param, - skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - param_out, - moment1_out, - moment2_out, - beta1_pow_out, - beta2_pow_out, - master_param_outs); -} - void AddNInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config) { diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c037641d082b759760035a6d33ccf2ecece1193e..7db4480ffb2e69024bfa39d7d88003e2da775de3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -68,55 +68,6 @@ void AdamaxInferMeta(const MetaTensor& param, MetaTensor* moment_out, MetaTensor* inf_norm_out); -void AdamInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment1, - const MetaTensor& moment2, - const MetaTensor& beta1_pow, - const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - MetaTensor* param_out, - MetaTensor* moment1_out, - MetaTensor* moment2_out, - MetaTensor* beta1_pow_out, - MetaTensor* beta2_pow_out, - MetaTensor* master_param_outs); - -void AdamwInferMeta(const MetaTensor& param, - const MetaTensor& grad, - const MetaTensor& learning_rate, - const MetaTensor& moment1, - const MetaTensor& moment2, - const MetaTensor& beta1_pow, - const MetaTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - 
int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - MetaTensor* param_out, - MetaTensor* moment1_out, - MetaTensor* moment2_out, - MetaTensor* beta1_pow_out, - MetaTensor* beta2_pow_out, - MetaTensor* master_param_outs); - void AddNInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index d4b832cef0bd253fa90c7f445667d94d886aca19..5aae2bbe36eef6f2d9fe91ea58d8e68b841d536b 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -27,14 +27,13 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel) # Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. -set(MANUAL_BUILD_KERNELS cross_entropy_kernel adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel +set(MANUAL_BUILD_KERNELS cross_entropy_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel rnn_kernel rnn_grad_kernel warpctc_kernel warpctc_grad_kernel) -kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper) -kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel) + kernel_library(cross_entropy_kernel DEPS ${COMMON_KERNEL_DEPS} softmax cross_entropy) kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor) diff --git a/paddle/phi/kernels/adam_kernel.h b/paddle/phi/kernels/adam_kernel.h deleted file mode 100644 index f144d40d2b666c2d3785081bd1d0d795fe2513cc..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/adam_kernel.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
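The AdamwInferMeta routine deleted above added no shape logic of its own: lr_ratio, coeff and with_decay cannot affect any output shape, so it forwarded everything else to AdamInferMeta. A condensed sketch of that delegation pattern, with hypothetical stand-ins for MetaTensor and the real signatures:

    #include <cstdint>
    #include <vector>

    using Shape = std::vector<int64_t>;  // stand-in for MetaTensor dims

    // Adam's shape inference: every optimizer output mirrors Param's shape.
    void AdamInferSketch(const Shape& param, Shape* param_out, Shape* mom1_out,
                         Shape* mom2_out) {
      *param_out = param;
      *mom1_out = param;
      *mom2_out = param;
    }

    // AdamW's shape inference: the extra scalar attributes are shape-neutral,
    // so the call simply forwards.
    void AdamwInferSketch(const Shape& param, float lr_ratio, float coeff,
                          bool with_decay, Shape* param_out, Shape* mom1_out,
                          Shape* mom2_out) {
      (void)lr_ratio;
      (void)coeff;
      (void)with_decay;
      AdamInferSketch(param, param_out, mom1_out, mom2_out);
    }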
- -#pragma once - -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void AdamDenseKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs); - -} // namespace phi diff --git a/paddle/phi/kernels/adamw_kernel.h b/paddle/phi/kernels/adamw_kernel.h deleted file mode 100644 index d7b072adda4a21ea8f3a640fbb89916922709822..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/adamw_kernel.h +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/core/dense_tensor.h" - -namespace phi { - -template -void AdamwDenseKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs); - -} // namespace phi diff --git a/paddle/phi/kernels/cpu/adam_kernel.cc b/paddle/phi/kernels/cpu/adam_kernel.cc deleted file mode 100644 index 1e0f5c4df9fd67bb64318a4d1a905576cf24c718..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/adam_kernel.cc +++ /dev/null @@ -1,173 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
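Both kernel declarations deleted above receive MasterParam and SkipUpdate as paddle::optional inputs, and the kernel bodies test them with is_initialized() before reading. A minimal sketch of that convention, using std::optional as a stand-in for paddle::optional (names hypothetical):

    #include <cassert>
    #include <optional>
    #include <vector>

    // SkipUpdate, when present, must hold exactly one element; its value
    // decides whether the optimizer step degenerates to plain tensor copies.
    bool ReadSkipUpdate(const std::optional<std::vector<bool>>& skip_update) {
      if (!skip_update.has_value()) return false;  // input not wired up
      assert(skip_update->size() == 1);            // mirrors the numel()==1 check
      return skip_update->front();
    }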
- -#include "paddle/phi/kernels/adam_kernel.h" - -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" - -DECLARE_int32(inner_op_parallelism); - -namespace phi { - -template -void AdamDenseKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update_) { - VLOG(4) << "Adam skip update"; - phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); - phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); - phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); - - return; - } - - T beta1_ = beta1.to(); - T beta2_ = beta2.to(); - T epsilon_ = epsilon.to(); - - VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel(); - VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel(); - VLOG(3) << "param.numel(): " << param.numel(); - - PADDLE_ENFORCE_EQ( - beta1_pow_out->numel(), - 1, - errors::InvalidArgument("beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ( - beta2_pow_out->numel(), - 1, - errors::InvalidArgument("beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - T beta1_p = beta1_pow.data()[0]; - T beta2_p = beta2_pow.data()[0]; - - if (!use_global_beta_pow) { - dev_ctx.template Alloc(beta1_pow_out)[0] = beta1_ * beta1_p; - dev_ctx.template Alloc(beta2_pow_out)[0] = beta2_ * beta2_p; - } - - T* param_out_ptr = dev_ctx.template Alloc(param_out); - T* mom1_out_ptr = dev_ctx.template Alloc(moment1_out); - T* mom2_out_ptr = dev_ctx.template Alloc(moment2_out); - - T learning_rate_ = - learning_rate.data()[0] * (sqrt(1 - beta2_p) / (1 - beta1_p)); - T eps = epsilon_ * sqrt(1 - beta2_p); - - paddle::operators::jit::adam_attr_t attr(beta1_, beta2_); - int64_t numel = param.numel(); - - const T* param_ptr = param.data(); - const T* mom1_ptr = moment1.data(); - const T* mom2_ptr = moment2.data(); - const T* grad_ptr = grad.data(); - - auto adam = - paddle::operators::jit::KernelFuncs, - phi::CPUPlace>::Cache() - .At(attr); - - static 
constexpr int64_t chunk_size = 512; - -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int64_t i = 0; i < numel / chunk_size; ++i) { - const int64_t offset = i * chunk_size; - adam(beta1_, - beta2_, - -learning_rate_, - eps, - chunk_size, - grad_ptr + offset, - mom1_ptr + offset, - mom2_ptr + offset, - param_ptr + offset, - mom1_out_ptr + offset, - mom2_out_ptr + offset, - param_out_ptr + offset); - } - - if (numel % chunk_size != 0) { - const int64_t offset = (numel / chunk_size) * chunk_size; - const int64_t tail_numel = numel % chunk_size; - adam(beta1_, - beta2_, - -learning_rate_, - eps, - tail_numel, - grad_ptr + offset, - mom1_ptr + offset, - mom2_ptr + offset, - param_ptr + offset, - mom1_out_ptr + offset, - mom2_out_ptr + offset, - param_out_ptr + offset); - } -} - -} // namespace phi - -PD_REGISTER_KERNEL(adam, CPU, ALL_LAYOUT, phi::AdamDenseKernel, float, double) { -} diff --git a/paddle/phi/kernels/cpu/adamw_kernel.cc b/paddle/phi/kernels/cpu/adamw_kernel.cc deleted file mode 100644 index 3a7869a062cf13dbf4c742653b3e12d66afc7595..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/cpu/adamw_kernel.cc +++ /dev/null @@ -1,135 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
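The CPU Adam kernel deleted above walks the parameter in fixed 512-element chunks so that the JIT-generated adam routine can run under an OpenMP parallel-for, with one scalar tail call for the remainder. A plain-C++ sketch of just the chunking scheme, with a hypothetical update_range standing in for the JIT kernel:

    #include <cstdint>

    // update_range(offset, n) updates n consecutive elements starting at offset.
    template <typename F>
    void ChunkedUpdate(int64_t numel, F update_range) {
      constexpr int64_t chunk_size = 512;
    #ifdef PADDLE_WITH_MKLML
    #pragma omp parallel for
    #endif
      for (int64_t i = 0; i < numel / chunk_size; ++i) {
        update_range(i * chunk_size, chunk_size);  // full chunks, parallelizable
      }
      if (numel % chunk_size != 0) {
        // leftover tail, handled once outside the parallel loop
        update_range((numel / chunk_size) * chunk_size, numel % chunk_size);
      }
    }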
- -#include "paddle/phi/kernels/adamw_kernel.h" - -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/adam_kernel.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" - -namespace phi { - -template -void AdamwDenseKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - VLOG(3) << "Skip update" << skip_update_; - - if (skip_update_ || !with_decay) { - AdamDenseKernel(dev_ctx, - param, - grad, - learning_rate, - moment1, - moment2, - beta1_pow, - beta2_pow, - master_param, - skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - param_out, - moment1_out, - moment2_out, - beta1_pow_out, - beta2_pow_out, - master_param_outs); - return; - } - - auto* param_ = - master_param.is_initialized() ? master_param.get_ptr() : ¶m; - T coeff_ = static_cast(coeff); - T lr_ratio_ = static_cast(lr_ratio); - - funcs::AdamWFunctor functor( - coeff_, - lr_ratio_, - learning_rate.data(), - const_cast(param_->data())); - functor(param_->numel()); - - AdamDenseKernel(dev_ctx, - param, - grad, - learning_rate, - moment1, - moment2, - beta1_pow, - beta2_pow, - master_param, - skip_update, - beta1, - beta2, - epsilon, - lazy_mode, - min_row_size_to_use_multithread, - multi_precision, - use_global_beta_pow, - param_out, - moment1_out, - moment2_out, - beta1_pow_out, - beta2_pow_out, - master_param_outs); -} - -} // namespace phi - -PD_REGISTER_KERNEL( - adamw, CPU, ALL_LAYOUT, phi::AdamwDenseKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/adam_functors.h b/paddle/phi/kernels/funcs/adam_functors.h deleted file mode 100644 index 2f706f0ef1c36de3081bc298a8b882263c493586..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/funcs/adam_functors.h +++ /dev/null @@ -1,548 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include // for sqrt in CPU and CUDA -#include - -#include "paddle/phi/kernels/funcs/algorithm.h" - -namespace phi { -namespace funcs { - -struct GPUAdam; -struct CPUAdam; - -template -class AdamFunctor; - -template -class AdamFunctor { - private: - T beta1_; - T beta2_; - T epsilon_; - - const T* beta1_pow_; - const T* beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* lr_; - const T* grad_; - const T* param_; - T* param_out_; - - public: - AdamFunctor(T beta1, - T beta2, - T epsilon, - const T* beta1_pow, - const T* beta2_pow, - const T* mom1, - T* mom1_out, - const T* mom2, - T* mom2_out, - const T* lr, - const T* grad, - const T* param, - T* param_out) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out) {} - - inline HOSTDEVICE void operator()(size_t i) const { - // Merge all memory access together. - T g = grad_[i]; - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - T p = param_[i]; - - // Calculation - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - - mom1 = beta1_ * mom1 + (1 - beta1_) * g; - mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = p; - } -}; - -template -class AdamFunctor { - private: - T beta1_; - T beta2_; - T epsilon_; - - const T* beta1_pow_; - const T* beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* lr_; - const T* grad_; - const T* param_; - T* param_out_; - - public: - AdamFunctor(T beta1, - T beta2, - T epsilon, - const T* beta1_pow, - const T* beta2_pow, - const T* mom1, - T* mom1_out, - const T* mom2, - T* mom2_out, - const T* lr, - const T* grad, - const T* param, - T* param_out) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out) {} - - void operator()(size_t numel) const { - Eigen::Map> g{ - grad_, static_cast(numel)}; - Eigen::Map> mom1{ - moment1_, static_cast(numel)}; - Eigen::Map> mom2{ - moment2_, static_cast(numel)}; - Eigen::Map> param{ - param_, static_cast(numel)}; - - Eigen::Map> param_out{ - param_out_, static_cast(numel)}; - Eigen::Map> moment1_out{ - moment1_out_, static_cast(numel)}; - Eigen::Map> moment2_out{ - moment2_out_, static_cast(numel)}; - - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - - // Calculation - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - - moment1_out = beta1_ * mom1 + (1 - beta1_) * g; - moment2_out = beta2_ * mom2 + (1 - beta2_) * g * g; - param_out = param - - lr * (moment1_out / - (moment2_out.sqrt() + epsilon_ * sqrt(1 - beta2_pow))); - } -}; - -template -class SparseAdamFunctor; - -template -class SparseAdamFunctor { - private: - MT beta1_; - MT beta2_; - MT epsilon_; - - const MT* beta1_pow_; - const MT* beta2_pow_; - const MT* moment1_; - MT* moment1_out_; - const MT* moment2_; - MT* moment2_out_; - const MT* lr_; - 
const T* grad_; - const T* param_; - T* param_out_; - const MT* master_param_; - MT* master_param_out_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - bool lazy_mode_; - - public: - SparseAdamFunctor(MT beta1, - MT beta2, - MT epsilon, - const MT* beta1_pow, - const MT* beta2_pow, - const MT* mom1, - MT* mom1_out, - const MT* mom2, - MT* mom2_out, - const MT* lr, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - const int64_t* rows, - int64_t row_numel, - int64_t row_count, - bool lazy_mode) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out), - master_param_(master_param), - master_param_out_(master_param_out), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count), - lazy_mode_(lazy_mode) {} - - inline HOSTDEVICE void adam_update(size_t i, MT g) const { - // The following code is the same as dense - MT mom1 = moment1_[i]; - MT mom2 = moment2_[i]; - MT lr = *lr_; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - MT p = master_param_ ? master_param_[i] : static_cast(param_[i]); - - // Calculation - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - - mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + - epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = static_cast(p); - if (master_param_out_) { - master_param_out_[i] = p; - } - } - - inline HOSTDEVICE void operator()(size_t i) const { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); - if (lazy_mode_ && row_idx < 0) { - return; - } else { - MT g = row_idx >= 0 - ? 
static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) - : static_cast(0); - adam_update(i, g); - } - } -}; - -template -class SparseAdamFunctor { - private: - T beta1_; - T beta2_; - T epsilon_; - - const T* beta1_pow_; - const T* beta2_pow_; - const T* moment1_; - T* moment1_out_; - const T* moment2_; - T* moment2_out_; - const T* lr_; - const T* grad_; - const T* param_; - T* param_out_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - - public: - SparseAdamFunctor(T beta1, - T beta2, - T epsilon, - const T* beta1_pow, - const T* beta2_pow, - const T* mom1, - T* mom1_out, - const T* mom2, - T* mom2_out, - const T* lr, - const T* grad, - const T* param, - T* param_out, - const int64_t* rows, - int64_t row_numel, - int64_t row_count, - bool lazy_mode) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count) {} - - inline HOSTDEVICE void adam_update(size_t i, T g) const { - // The following code is the same as dense - T mom1 = moment1_[i]; - T mom2 = moment2_[i]; - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - T p = param_[i]; - - // Calculation - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - - mom1 = beta1_ * mom1 + (1 - beta1_) * g; - mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - p -= lr * (mom1 / (sqrt(mom2) + epsilon_ * sqrt(1 - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = p; - } - - inline void operator()(size_t numel) const { - // lr could be reuse - T lr = *lr_; - T beta1_pow = *beta1_pow_; - T beta2_pow = *beta2_pow_; - lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - int64_t row_count = static_cast(numel / row_numel_); - - for (int64_t i = 0, j = 0; i != row_count; ++i) { - if (i == *(rows_ + j)) { - for (int64_t k = 0; k != row_numel_; ++k) { - T g = grad_[j * row_numel_ + k]; - adam_update(i * row_numel_ + k, g); - } - ++j; - } else { - for (int64_t k = 0; k != row_numel_; ++k) { - T mom1 = moment1_[i * row_numel_ + k]; - T mom2 = moment2_[i * row_numel_ + k]; - T p = param_[i * row_numel_ + k]; - - mom1 = beta1_ * mom1; - mom2 = beta2_ * mom2; - - p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); - // Write back to global memory - moment1_out_[i * row_numel_ + k] = mom1; - moment2_out_[i * row_numel_ + k] = mom2; - param_out_[i * row_numel_ + k] = p; - } - } - } - } -}; - -struct GPUAdamW; -struct CPUAdamW; - -template -class AdamWFunctor; - -template -class AdamWFunctor { - private: - const T coeff_; - const T lr_ratio_; - const T* lr_; - T* param_; - - public: - AdamWFunctor(const T coeff, const T lr_ratio, const T* lr, T* param) - : coeff_(coeff), lr_ratio_(lr_ratio), lr_(lr), param_(param) {} - - inline HOSTDEVICE void operator()(size_t numel) const { - Eigen::Map> param{ - param_, static_cast(numel)}; - - T lr = *lr_; - - // Calculation - param -= lr * lr_ratio_ * coeff_ * param; - } -}; - -template -class SparseAdamWFunctor; - -template -class SparseAdamWFunctor { - private: - MT beta1_; - MT beta2_; - MT epsilon_; - MT coeff_; - MT lr_ratio_; - - const MT* beta1_pow_; - const MT* beta2_pow_; - const MT* moment1_; - MT* moment1_out_; - const MT* moment2_; - MT* moment2_out_; - const MT* lr_; - const T* grad_; - const T* param_; - T* param_out_; - const MT* master_param_; - MT* 
master_param_out_; - - const int64_t* rows_; - int64_t row_numel_; - int64_t row_count_; - bool lazy_mode_; - - public: - SparseAdamWFunctor(MT beta1, - MT beta2, - MT epsilon, - MT coeff, - MT lr_ratio, - const MT* beta1_pow, - const MT* beta2_pow, - const MT* mom1, - MT* mom1_out, - const MT* mom2, - MT* mom2_out, - const MT* lr, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - const int64_t* rows, - int64_t row_numel, - int64_t row_count, - bool lazy_mode) - : beta1_(beta1), - beta2_(beta2), - epsilon_(epsilon), - coeff_(coeff), - lr_ratio_(lr_ratio), - beta1_pow_(beta1_pow), - beta2_pow_(beta2_pow), - moment1_(mom1), - moment1_out_(mom1_out), - moment2_(mom2), - moment2_out_(mom2_out), - lr_(lr), - grad_(grad), - param_(param), - param_out_(param_out), - master_param_(master_param), - master_param_out_(master_param_out), - rows_(rows), - row_numel_(row_numel), - row_count_(row_count), - lazy_mode_(lazy_mode) {} - - inline HOSTDEVICE void adamw_update(size_t i, MT g) const { - // The following code is the same as dense - MT mom1 = moment1_[i]; - MT mom2 = moment2_[i]; - MT lr = *lr_ * lr_ratio_; - MT lr_orig = lr; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - MT p = master_param_ ? master_param_[i] : static_cast(param_[i]); - - // Calculation - lr *= sqrt(static_cast(1.0) - beta2_pow) / - (static_cast(1.0) - beta1_pow); - - mom1 = beta1_ * mom1 + (static_cast(1.0) - beta1_) * g; - mom2 = beta2_ * mom2 + (static_cast(1.0) - beta2_) * g * g; - p -= lr_orig * coeff_ * p; - p -= lr * (mom1 / (sqrt(mom2) + - epsilon_ * sqrt(static_cast(1.0) - beta2_pow))); - - // Write back to global memory - moment1_out_[i] = mom1; - moment2_out_[i] = mom2; - param_out_[i] = static_cast(p); - if (master_param_out_) { - master_param_out_[i] = p; - } - } - - inline HOSTDEVICE void operator()(size_t i) const { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count_, i / row_numel_); - if (lazy_mode_ && row_idx < 0) { - return; - } else { - MT g = row_idx >= 0 - ? static_cast(grad_[row_idx * row_numel_ + i % row_numel_]) - : static_cast(0); - adamw_update(i, g); - } - } -}; - -} // namespace funcs -} // namespace phi diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu deleted file mode 100644 index d3317e258e5382d5d2ca49916da056e8f8506527..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ /dev/null @@ -1,275 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
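All of the sparse functors above translate a flat element index into a gradient row with a binary search over the sorted row list; under lazy_mode an element whose row is absent is skipped, otherwise it is updated with a zero gradient so its moments still decay. A compact sketch of that lookup (an illustrative stand-in for phi::funcs::BinarySearch):

    #include <algorithm>
    #include <cstdint>

    // Returns the position of `row` inside the sorted array rows[0..row_count),
    // or -1 when the row received no gradient this step.
    inline int64_t FindRow(const int64_t* rows, int64_t row_count, int64_t row) {
      const int64_t* end = rows + row_count;
      const int64_t* it = std::lower_bound(rows, end, row);
      return (it != end && *it == row) ? (it - rows) : -1;
    }

    // For a flat index i: row = i / row_numel, column = i % row_numel.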
- -#include "paddle/phi/kernels/adam_kernel.h" - -#include // for sqrt in CPU and CUDA -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { - -template -__global__ void AdamKernelREG(MT beta1, - MT beta2, - MT epsilon, - MT beta1_pow_, - MT beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = beta1_pow_; - MT beta2_pow = beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void AdamKernelMEM(MT beta1, - MT beta2, - MT epsilon, - const MT* beta1_pow_, - const MT* beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? 
master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void UpdateBetaPow(T beta1, - T beta2, - const T* beta1_pow_, - const T* beta2_pow_, - T* beta1_pow_out, - T* beta2_pow_out) { - *beta1_pow_out = beta1 * beta1_pow_[0]; - *beta2_pow_out = beta2 * beta2_pow_[0]; -} - -template -void AdamDenseKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update_) { - VLOG(4) << "Adam skip update"; - phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); - phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); - phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, beta2_pow_out); - return; - } - - MPDType beta1_ = beta1.to(); - MPDType beta2_ = beta2.to(); - MPDType epsilon_ = epsilon.to(); - VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() - << "beta2_pow.numel() : " << beta2_pow.numel(); - VLOG(3) << "param.numel(): " << param.numel(); - PADDLE_ENFORCE_EQ( - beta1_pow_out->numel(), - 1, - errors::InvalidArgument("beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ( - beta2_pow_out->numel(), - 1, - errors::InvalidArgument("beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - const MPDType* master_in_data = - multi_precision ? master_param->data() : nullptr; - MPDType* master_out_data = - multi_precision ? 
dev_ctx.template Alloc(master_param_outs) - : nullptr; - - // update param and moment - int threads = 512; - int blocks = (param.numel() + threads - 1) / threads; - - if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) { - // Compute with betapow in REG - AdamKernelREG<<>>( - beta1_, - beta2_, - epsilon_, - *beta1_pow.data(), - *beta2_pow.data(), - moment1.data(), - dev_ctx.template Alloc(moment1_out), - moment2.data(), - dev_ctx.template Alloc(moment2_out), - learning_rate.data(), - grad.data(), - param.data(), - dev_ctx.template Alloc(param_out), - master_in_data, - master_out_data, - param.numel()); - if (!use_global_beta_pow) { - // Cpu update - dev_ctx.template HostAlloc(beta1_pow_out)[0] = - beta1_ * beta1_pow.data()[0]; - dev_ctx.template HostAlloc(beta2_pow_out)[0] = - beta2_ * beta2_pow.data()[0]; - } - } else { - AdamKernelMEM<<>>( - beta1_, - beta2_, - epsilon_, - beta1_pow.data(), - beta2_pow.data(), - moment1.data(), - dev_ctx.template Alloc(moment1_out), - moment2.data(), - dev_ctx.template Alloc(moment2_out), - learning_rate.data(), - grad.data(), - param.data(), - dev_ctx.template Alloc(param_out), - master_in_data, - master_out_data, - param.numel()); - if (!use_global_beta_pow) { - // Update with gpu - UpdateBetaPow<<<1, 32, 0, dev_ctx.stream()>>>( - beta1_, - beta2_, - beta1_pow.data(), - beta2_pow.data(), - dev_ctx.template Alloc(beta1_pow_out), - dev_ctx.template Alloc(beta2_pow_out)); - } - } -} - -} // namespace phi - -PD_REGISTER_KERNEL(adam, - GPU, - ALL_LAYOUT, - phi::AdamDenseKernel, - float, - double, - phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/adamw_kernel.cu b/paddle/phi/kernels/gpu/adamw_kernel.cu deleted file mode 100644 index 8fef101383bb09c49c88a6cc36ddf8af46e8be65..0000000000000000000000000000000000000000 --- a/paddle/phi/kernels/gpu/adamw_kernel.cu +++ /dev/null @@ -1,302 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
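The GPU kernel deleted above exists in two flavours because the beta-pow accumulators may live on either device. When they sit in CPU memory, their scalar values are dereferenced on the host and passed by value (AdamKernelREG, "betapow in REG"), and the host advances them afterwards; when they sit in GPU memory, only the pointers can be passed (AdamKernelMEM), and the single-thread UpdateBetaPow kernel advances them on the device. A condensed sketch of that dispatch, with hypothetical launch stubs in place of the real kernel launches:

    // Stand-ins for the real kernel launches.
    void LaunchREG(float beta1_pow, float beta2_pow);  // scalars by value
    void LaunchMEM(const float* beta1_pow, const float* beta2_pow);

    void DispatchBetaPow(bool beta_pow_on_cpu, const float* b1,
                         const float* b2) {
      if (beta_pow_on_cpu) {
        LaunchREG(*b1, *b2);  // safe: b1/b2 point into host memory
      } else {
        LaunchMEM(b1, b2);    // kernel must read the device-resident scalars
      }
    }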
- -#include "paddle/phi/kernels/adamw_kernel.h" - -#include // for sqrt in CPU and CUDA -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { -template -__global__ void AdamWKernelREG(MT beta1, - MT beta2, - MT epsilon, - MT coeff, - MT lr_ratio, - MT beta1_pow_, - MT beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_ * lr_ratio; - MT beta1_pow = beta1_pow_; - MT beta2_pow = beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - - p *= (static_cast(1.0) - lr * coeff); - - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void AdamWKernelMEM(MT beta1, - MT beta2, - MT epsilon, - MT coeff, - MT lr_ratio, - const MT* beta1_pow_, - const MT* beta2_pow_, - const MT* moment1, - MT* moment1_out, - const MT* moment2, - MT* moment2_out, - const MT* lr_, - const T* grad, - const T* param, - T* param_out, - const MT* master_param, - MT* master_param_out, - int ndim) { - MT lr = *lr_ * lr_ratio; - MT beta1_pow = *beta1_pow_; - MT beta2_pow = *beta2_pow_; - - int id = blockIdx.x * blockDim.x + threadIdx.x; - - for (; id < ndim; id += gridDim.x * blockDim.x) { - MT p = master_param ? 
master_param[id] : static_cast(param[id]); - MT g = static_cast(grad[id]); - MT mom1 = static_cast(moment1[id]); - MT mom2 = static_cast(moment2[id]); - - p *= (static_cast(1.0) - lr * coeff); - - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - moment1_out[id] = mom1; - moment2_out[id] = mom2; - param_out[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } -} - -template -__global__ void UpdateAdamWBetaPow(T beta1, - T beta2, - const T* beta1_pow_, - const T* beta2_pow_, - T* beta1_pow_out, - T* beta2_pow_out) { - *beta1_pow_out = beta1 * beta1_pow_[0]; - *beta2_pow_out = beta2 * beta2_pow_[0]; -} - -template -void AdamwDenseKernel(const Context& dev_ctx, - const DenseTensor& param, - const DenseTensor& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - MPDType coeff_ = static_cast(coeff); - MPDType lr_ratio_ = static_cast(lr_ratio); - - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec); - skip_update_ = skip_update_vec[0]; - } - - // skip_update=true, just copy input to output, and TensorCopy will call - // mutable_data - if (skip_update_) { - VLOG(4) << "Adamw skip update"; - phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out); - phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out); - phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out); - phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out); - phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out); - return; - } - - // if with_decay = false, coeff = 0 - if (!with_decay) { - coeff_ = static_cast(0.0); - } - - MPDType beta1_ = beta1.to(); - MPDType beta2_ = beta2.to(); - MPDType epsilon_ = epsilon.to(); - VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel() - << "beta2_pow.numel() : " << beta2_pow.numel(); - VLOG(3) << "param.numel(): " << param.numel(); - PADDLE_ENFORCE_EQ( - beta1_pow_out->numel(), - 1, - errors::InvalidArgument("beta1 pow output size should be 1, but received " - "value is:%d.", - beta1_pow_out->numel())); - - PADDLE_ENFORCE_EQ( - beta2_pow_out->numel(), - 1, - errors::InvalidArgument("beta2 pow output size should be 1, but received " - "value is:%d.", - beta2_pow_out->numel())); - - const MPDType* master_in_data = - multi_precision ? master_param->data() : nullptr; - MPDType* master_out_data = - multi_precision ? 
-      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_outs)
-                      : nullptr;
-
-  // update param and moment
-  int threads = 512;
-  int blocks = (param.numel() + threads - 1) / threads;
-
-  if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
-    // Compute with betapow in REG
-    AdamWKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        beta1_,
-        beta2_,
-        epsilon_,
-        coeff_,
-        lr_ratio_,
-        *beta1_pow.data<MPDType>(),
-        *beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad.data<T>(),
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        param.numel());
-    if (!use_global_beta_pow) {
-      // Cpu update
-      dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
-          beta1_ * beta1_pow.data<MPDType>()[0];
-      dev_ctx.template HostAlloc<MPDType>(beta2_pow_out)[0] =
-          beta2_ * beta2_pow.data<MPDType>()[0];
-    }
-  } else {
-    AdamWKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        beta1_,
-        beta2_,
-        epsilon_,
-        coeff_,
-        lr_ratio_,
-        beta1_pow.data<MPDType>(),
-        beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad.data<T>(),
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        param.numel());
-    if (!use_global_beta_pow) {
-      // Update with gpu
-      UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
-          beta1_,
-          beta2_,
-          beta1_pow.data<MPDType>(),
-          beta2_pow.data<MPDType>(),
-          dev_ctx.template Alloc<MPDType>(beta1_pow_out),
-          dev_ctx.template Alloc<MPDType>(beta2_pow_out));
-    }
-  }
-}
-
-}  // namespace phi
-
-PD_REGISTER_KERNEL(adamw,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::AdamwDenseKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
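The two kernels deleted above implement the same per-element update and differ only in how they receive the beta-power accumulators: AdamWKernelREG takes them by value (possible when the accumulators live on the CPU), while AdamWKernelMEM dereferences device pointers. For reference, a minimal host-side C++ sketch of the update they both apply follows; the function name and pointer interface are illustrative, not Paddle API.

#include <cmath>

// Per-element AdamW step mirrored from the deleted AdamWKernelREG/MEM bodies:
// decoupled weight decay is applied to the parameter first, then a
// bias-corrected Adam step. MT is the math (master) type.
template <typename MT>
void AdamWUpdateSketch(MT lr, MT coeff, MT beta1, MT beta2, MT epsilon,
                       MT beta1_pow, MT beta2_pow, MT g,
                       MT* p, MT* mom1, MT* mom2) {
  *p *= (static_cast<MT>(1.0) - lr * coeff);  // decay the weight directly
  *mom1 = beta1 * *mom1 + (static_cast<MT>(1.0) - beta1) * g;
  *mom2 = beta2 * *mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
  MT denom = (std::sqrt(*mom2) / std::sqrt(static_cast<MT>(1.0) - beta2_pow)) +
             epsilon;
  *p += (*mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
}

Because the decay multiplies the parameter before the moment update, the weight decay never enters mom1/mom2, which is what distinguishes AdamW from Adam with an L2 term folded into the gradient.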
diff --git a/paddle/phi/kernels/selected_rows/adam_kernel.h b/paddle/phi/kernels/selected_rows/adam_kernel.h
deleted file mode 100644
index 2e13d29d172844abbdf097590b4d19d7b0fede11..0000000000000000000000000000000000000000
--- a/paddle/phi/kernels/selected_rows/adam_kernel.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/common/scalar.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/selected_rows.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T, typename Context>
-void AdamDenseParamSparseGradKernel(
-    const Context& dev_ctx,
-    const DenseTensor& param,
-    const SelectedRows& grad,
-    const DenseTensor& learning_rate,
-    const DenseTensor& moment1,
-    const DenseTensor& moment2,
-    const DenseTensor& beta1_pow,
-    const DenseTensor& beta2_pow,
-    paddle::optional<const DenseTensor&> master_param,
-    paddle::optional<const DenseTensor&> skip_update,
-    const Scalar& beta1,
-    const Scalar& beta2,
-    const Scalar& epsilon,
-    bool lazy_mode,
-    int64_t min_row_size_to_use_multithread,
-    bool multi_precision,
-    bool use_global_beta_pow,
-    DenseTensor* param_out,
-    DenseTensor* moment1_out,
-    DenseTensor* moment2_out,
-    DenseTensor* beta1_pow_out,
-    DenseTensor* beta2_pow_out,
-    DenseTensor* master_param_outs);
-
-}  // namespace sr
-}  // namespace phi
diff --git a/paddle/phi/kernels/selected_rows/adamw_kernel.h b/paddle/phi/kernels/selected_rows/adamw_kernel.h
deleted file mode 100644
index ddb155ce4504e964a40777db81785c5f85765cce..0000000000000000000000000000000000000000
--- a/paddle/phi/kernels/selected_rows/adamw_kernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/phi/common/scalar.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/selected_rows.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T, typename Context>
-void AdamwDenseParamSparseGradKernel(
-    const Context& dev_ctx,
-    const DenseTensor& param,
-    const SelectedRows& grad,
-    const DenseTensor& learning_rate,
-    const DenseTensor& moment1,
-    const DenseTensor& moment2,
-    const DenseTensor& beta1_pow,
-    const DenseTensor& beta2_pow,
-    paddle::optional<const DenseTensor&> master_param,
-    paddle::optional<const DenseTensor&> skip_update,
-    const Scalar& beta1,
-    const Scalar& beta2,
-    const Scalar& epsilon,
-    float lr_ratio,
-    float coeff,
-    bool with_decay,
-    bool lazy_mode,
-    int64_t min_row_size_to_use_multithread,
-    bool multi_precision,
-    bool use_global_beta_pow,
-    DenseTensor* param_out,
-    DenseTensor* moment1_out,
-    DenseTensor* moment2_out,
-    DenseTensor* beta1_pow_out,
-    DenseTensor* beta2_pow_out,
-    DenseTensor* master_param_outs);
-
-}  // namespace sr
-}  // namespace phi
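Both headers deleted above declare updates for a dense parameter driven by a SelectedRows (row-sparse) gradient: the gradient stores one dense row of width row_numel for each entry of grad.rows(), and only those parameter rows receive gradient. A small self-contained C++ sketch of that layout, with hypothetical names rather than the actual phi::SelectedRows API:

#include <cstdint>
#include <vector>

// Row-sparse gradient layout assumed by the sr:: kernels in this patch.
struct RowSparseGrad {
  std::vector<int64_t> rows;  // parameter rows that carry gradient
  std::vector<float> value;   // rows.size() * row_numel contiguous values
  int64_t row_numel;          // width of one dense row
};

// Visit every stored gradient element together with its flat parameter index.
template <typename F>
void ForEachGradElem(const RowSparseGrad& g, F visit) {
  for (size_t r = 0; r < g.rows.size(); ++r) {
    for (int64_t c = 0; c < g.row_numel; ++c) {
      visit(g.rows[r] * g.row_numel + c, g.value[r * g.row_numel + c]);
    }
  }
}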
diff --git a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc
deleted file mode 100644
index 57e33beb95e3e272a1d3255a585d61f25f09c306..0000000000000000000000000000000000000000
--- a/paddle/phi/kernels/selected_rows/cpu/adam_kernel.cc
+++ /dev/null
@@ -1,242 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
-
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/framework/threadpool.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/copy_kernel.h"
-#include "paddle/phi/kernels/funcs/adam_functors.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T, typename Context>
-void AdamDenseParamSparseGradKernel(
-    const Context& dev_ctx,
-    const DenseTensor& param,
-    const SelectedRows& grad,
-    const DenseTensor& learning_rate,
-    const DenseTensor& moment1,
-    const DenseTensor& moment2,
-    const DenseTensor& beta1_pow,
-    const DenseTensor& beta2_pow,
-    paddle::optional<const DenseTensor&> master_param,
-    paddle::optional<const DenseTensor&> skip_update,
-    const Scalar& beta1,
-    const Scalar& beta2,
-    const Scalar& epsilon,
-    bool lazy_mode,
-    int64_t min_row_size_to_use_multithread,
-    bool multi_precision,
-    bool use_global_beta_pow,
-    DenseTensor* param_out,
-    DenseTensor* moment1_out,
-    DenseTensor* moment2_out,
-    DenseTensor* beta1_pow_out,
-    DenseTensor* beta2_pow_out,
-    DenseTensor* master_param_outs) {
-  VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
-
-  bool skip_update_ = false;
-  if (skip_update.is_initialized()) {
-    PADDLE_ENFORCE_EQ(
-        skip_update->numel(),
-        1,
-        errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
-                                skip_update->numel()));
-    std::vector<bool> skip_update_vec;
-    paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
-    skip_update_ = skip_update_vec[0];
-  }
-  // skip_update=true, just copy input to output, and TensorCopy will call
-  // mutable_data
-  if (skip_update_) {
-    VLOG(4) << "Adam skip update";
-    phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
-    phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
-    phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
-    phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
-    phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
-    return;
-  }
-
-  T beta1_ = beta1.to<T>();
-  T beta2_ = beta2.to<T>();
-  T epsilon_ = epsilon.to<T>();
-
-  VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel();
-  VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel();
-  VLOG(3) << "param.numel(): " << param.numel();
-
-  PADDLE_ENFORCE_EQ(
-      beta1_pow_out->numel(),
-      1,
-      errors::InvalidArgument("beta1 pow output size should be 1, but received "
-                              "value is:%d.",
-                              beta1_pow_out->numel()));
-
-  PADDLE_ENFORCE_EQ(
-      beta2_pow_out->numel(),
-      1,
-      errors::InvalidArgument("beta2 pow output size should be 1, but received "
-                              "value is:%d.",
-                              beta2_pow_out->numel()));
-
-  if (grad.rows().size() == 0) {
-    VLOG(3) << "grad row size is 0!!";
-    return;
-  }
-
-  std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
-  bool is_strict_sorted = true;
-  for (size_t i = 1; i < cpu_rows.size(); ++i) {
-    if (cpu_rows[i - 1] >= cpu_rows[i]) {
-      is_strict_sorted = false;
-      break;
-    }
-  }
-
-  phi::SelectedRows tmp_grad_merge;
-  const phi::SelectedRows* grad_merge_ptr;
-  if (is_strict_sorted) {
-    grad_merge_ptr = &grad;
-  } else {
-    // merge duplicated rows if any.
-    // The rows of grad_merge have been sorted inside MergeAdd functor
-    paddle::operators::math::scatter::MergeAdd<Context, T> merge_func;
-    merge_func(dev_ctx, grad, &tmp_grad_merge, true);
-    grad_merge_ptr = &tmp_grad_merge;
-  }
-
-  auto& grad_merge = *grad_merge_ptr;
-  auto& grad_tensor = grad_merge.value();
-  const T* grad_data = grad_tensor.template data<T>();
-  auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
-  const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
-  auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
-
-  funcs::SparseAdamFunctor<T, funcs::CPUAdam> functor(
-      beta1_,
-      beta2_,
-      epsilon_,
-      beta1_pow.data<T>(),
-      beta2_pow.data<T>(),
-      moment1.data<T>(),
-      dev_ctx.template Alloc<T>(moment1_out),
-      moment2.data<T>(),
-      dev_ctx.template Alloc<T>(moment2_out),
-      learning_rate.data<T>(),
-      grad_data,
-      param.data<T>(),
-      dev_ctx.template Alloc<T>(param_out),
-      rows,
-      row_numel,
-      grad_merge.rows().size(),
-      lazy_mode);
-  // update beta1 and beta2
-  if (!use_global_beta_pow) {
-    dev_ctx.template Alloc<T>(beta1_pow_out)[0] =
-        beta1_ * beta1_pow.data<T>()[0];
-    dev_ctx.template Alloc<T>(beta2_pow_out)[0] =
-        beta2_ * beta2_pow.data<T>()[0];
-  }
-  if (lazy_mode) {
-    VLOG(3) << "run cpu lazy mode";
-    size_t row_count = grad_merge.rows().size();
-    std::vector<int64_t> cpu_rows(grad_merge.rows());
-    for (size_t row_index = 0; row_index < row_count; ++row_index) {
-      for (size_t offset = 0; offset < row_numel; ++offset) {
-        size_t i = cpu_rows[row_index] * row_numel + offset;
-        functor.adam_update(i, grad_data[row_index * row_numel + offset]);
-      }
-    }
-  }
-#ifndef _WIN32
-  else if (FLAGS_inner_op_parallelism > 1 &&  // NOLINT
-           min_row_size_to_use_multithread > 0 &&
-           param.dims()[0] > min_row_size_to_use_multithread) {
-    VLOG(3) << "use multi thread, inner_op_parallelism="
-            << FLAGS_inner_op_parallelism
-            << " min_row_size_to_use_multithread="
-            << min_row_size_to_use_multithread;
-    if (FLAGS_inner_op_parallelism > 10) {
-      VLOG(1) << "FLAGS_inner_op_parallelism " << FLAGS_inner_op_parallelism
-              << " is two large!";
-    }
-    auto& grad_rows = grad_merge.rows();
-    std::unordered_map<size_t, int64_t> row_id_to_grad_row_offset;
-    size_t param_row_count = param.numel() / row_numel;
-    if (param_row_count < 1000) {
-      VLOG(1) << "param_row_count should be larger then 1000 to use "
-                 "multi thread, currently "
-              << param_row_count;
-    }
-    for (size_t i = 0; i < grad_rows.size(); ++i) {
-      row_id_to_grad_row_offset[grad_rows[i]] = i;
-    }
-    std::vector<std::future<void>> fs;
-    int64_t line_in_each_thread =
-        param_row_count / FLAGS_inner_op_parallelism + 1;
-    for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
-      int64_t start = i * line_in_each_thread;
-      int64_t end = (i + 1) * line_in_each_thread;
-      if (start >= static_cast<int64_t>(param_row_count)) {
-        break;
-      }
-      if (end > static_cast<int64_t>(param_row_count)) {
-        end = static_cast<int64_t>(param_row_count);
-      }
-      fs.push_back(paddle::framework::Async([&functor,
-                                             &row_id_to_grad_row_offset,
-                                             &grad_data,
-                                             row_numel,
-                                             start,
-                                             end]() {
-        for (int64_t row_id = start; row_id < end; ++row_id) {
-          auto iter = row_id_to_grad_row_offset.find(row_id);
-          if (iter != row_id_to_grad_row_offset.end()) {
-            for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) {
-              functor.adam_update(
-                  row_id * row_numel + row_offset,
-                  grad_data[iter->second * row_numel + row_offset]);
-            }
-          } else {
-            for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) {
-              functor.adam_update(row_id * row_numel + row_offset, 0);
-            }
-          }
-        }
-      }));
-    }
-    for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
-  }
-#endif  // !_WIN32
-  else {  // NOLINT
-    functor(param.numel());
-  }
-}
-
-}  // namespace sr
-}  // namespace phi
-
-PD_REGISTER_KERNEL(adam_dense_param_sparse_grad,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::sr::AdamDenseParamSparseGradKernel,
-                   float,
-                   double) {}
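The CPU kernel deleted above shards the update across threads when FLAGS_inner_op_parallelism is set: parameter rows are split into contiguous blocks, and rows with no matching gradient row still call adam_update with a zero gradient so their moment estimates keep decaying. A simplified sketch of that sharding, using std::thread instead of paddle::framework::Async and illustrative names:

#include <algorithm>
#include <cstdint>
#include <functional>
#include <thread>
#include <vector>

// Split [0, row_count) into num_threads contiguous shards and run
// fn(start, end) on each shard in parallel, like the deleted loop above.
void ShardRows(int64_t row_count, int num_threads,
               const std::function<void(int64_t, int64_t)>& fn) {
  std::vector<std::thread> workers;
  int64_t per_thread = row_count / num_threads + 1;
  for (int i = 0; i < num_threads; ++i) {
    int64_t start = i * per_thread;
    if (start >= row_count) break;
    int64_t end = std::min(start + per_thread, row_count);
    workers.emplace_back([&fn, start, end] { fn(start, end); });
  }
  for (auto& w : workers) w.join();
}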
diff --git a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc
deleted file mode 100644
index a52bca761108c49be303c3a371734f2ee2ec2e1a..0000000000000000000000000000000000000000
--- a/paddle/phi/kernels/selected_rows/cpu/adamw_kernel.cc
+++ /dev/null
@@ -1,140 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/selected_rows/adamw_kernel.h"
-
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/common/float16.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/adam_kernel.h"
-#include "paddle/phi/kernels/funcs/adam_functors.h"
-#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T, typename Context>
-void AdamwDenseParamSparseGradKernel(
-    const Context& dev_ctx,
-    const DenseTensor& param,
-    const SelectedRows& grad,
-    const DenseTensor& learning_rate,
-    const DenseTensor& moment1,
-    const DenseTensor& moment2,
-    const DenseTensor& beta1_pow,
-    const DenseTensor& beta2_pow,
-    paddle::optional<const DenseTensor&> master_param,
-    paddle::optional<const DenseTensor&> skip_update,
-    const Scalar& beta1,
-    const Scalar& beta2,
-    const Scalar& epsilon,
-    float lr_ratio,
-    float coeff,
-    bool with_decay,
-    bool lazy_mode,
-    int64_t min_row_size_to_use_multithread,
-    bool multi_precision,
-    bool use_global_beta_pow,
-    DenseTensor* param_out,
-    DenseTensor* moment1_out,
-    DenseTensor* moment2_out,
-    DenseTensor* beta1_pow_out,
-    DenseTensor* beta2_pow_out,
-    DenseTensor* master_param_outs) {
-  bool skip_update_ = false;
-  if (skip_update.is_initialized()) {
-    PADDLE_ENFORCE_EQ(
-        skip_update->numel(),
-        1,
-        errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
-                                skip_update->numel()));
-    std::vector<bool> skip_update_vec;
-    paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
-    skip_update_ = skip_update_vec[0];
-  }
-  VLOG(3) << "Skip update" << skip_update_;
-
-  if (skip_update_ || !with_decay) {
-    AdamDenseParamSparseGradKernel<T, Context>(dev_ctx,
-                                               param,
-                                               grad,
-                                               learning_rate,
-                                               moment1,
-                                               moment2,
-                                               beta1_pow,
-                                               beta2_pow,
-                                               master_param,
-                                               skip_update,
-                                               beta1,
-                                               beta2,
-                                               epsilon,
-                                               lazy_mode,
-                                               min_row_size_to_use_multithread,
-                                               multi_precision,
-                                               use_global_beta_pow,
-                                               param_out,
-                                               moment1_out,
-                                               moment2_out,
-                                               beta1_pow_out,
-                                               beta2_pow_out,
-                                               master_param_outs);
-    return;
-  }
-
-  auto* param_ =
-      master_param.is_initialized() ? master_param.get_ptr() : &param;
-  T coeff_ = static_cast<T>(coeff);
-  T lr_ratio_ = static_cast<T>(lr_ratio);
-  funcs::AdamWFunctor<T, funcs::CPUAdamW> functor(
-      coeff_,
-      lr_ratio_,
-      learning_rate.data<T>(),
-      const_cast<T*>(param_->data<T>()));
-  functor(param_->numel());
-
-  AdamDenseParamSparseGradKernel<T, Context>(dev_ctx,
-                                             param,
-                                             grad,
-                                             learning_rate,
-                                             moment1,
-                                             moment2,
-                                             beta1_pow,
-                                             beta2_pow,
-                                             master_param,
-                                             skip_update,
-                                             beta1,
-                                             beta2,
-                                             epsilon,
-                                             lazy_mode,
-                                             min_row_size_to_use_multithread,
-                                             multi_precision,
-                                             use_global_beta_pow,
-                                             param_out,
-                                             moment1_out,
-                                             moment2_out,
-                                             beta1_pow_out,
-                                             beta2_pow_out,
-                                             master_param_outs);
-}
-
-}  // namespace sr
-}  // namespace phi
-
-PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad,
-                   CPU,
-                   ALL_LAYOUT,
-                   phi::sr::AdamwDenseParamSparseGradKernel,
-                   float,
-                   double) {}
diff --git a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu
deleted file mode 100644
index 32c05765a9ab0fa239d459dc28b4573ee29eb7cb..0000000000000000000000000000000000000000
--- a/paddle/phi/kernels/selected_rows/gpu/adam_kernel.cu
+++ /dev/null
@@ -1,287 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
-
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/selected_rows_functor.h"
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/amp_type_traits.h"
-#include "paddle/phi/common/float16.h"
-#include "paddle/phi/core/kernel_registry.h"
-#include "paddle/phi/kernels/copy_kernel.h"
-#include "paddle/phi/kernels/funcs/adam_functors.h"
-#include "paddle/phi/kernels/funcs/for_range.h"
-
-namespace phi {
-namespace sr {
-
-template <typename T>
-__global__ void UpdateBetaPow(T beta1,
-                              T beta2,
-                              const T* beta1_pow_,
-                              const T* beta2_pow_,
-                              T* beta1_pow_out,
-                              T* beta2_pow_out) {
-  *beta1_pow_out = beta1 * beta1_pow_[0];
-  *beta2_pow_out = beta2 * beta2_pow_[0];
-}
-
-template <typename T, typename MT>
-__global__ void SparseAdamCUDAKernelREG(MT beta1,
-                                        MT beta2,
-                                        MT epsilon,
-                                        const MT beta1_pow,
-                                        const MT beta2_pow,
-                                        const MT* mom1_,
-                                        MT* mom1_out_,
-                                        const MT* mom2_,
-                                        MT* mom2_out_,
-                                        const MT* lr_,
-                                        const T* grad_,
-                                        const T* param_,
-                                        T* param_out_,
-                                        const MT* master_param,
-                                        MT* master_param_out,
-                                        const int64_t* rows_,
-                                        int64_t row_numel,
-                                        int64_t row_count,
-                                        bool lazy_mode,
-                                        int ndim) {
-  int id = blockIdx.x * blockDim.x + threadIdx.x;
-  MT lr = *lr_;
-
-  for (; id < ndim; id += blockDim.x * gridDim.x) {
-    auto row_idx =
-        phi::funcs::BinarySearch<int64_t>(rows_, row_count, id / row_numel);
-    if (lazy_mode && row_idx < 0) {
-      return;
-    } else {
-      MT mom1 = mom1_[id];
-      MT mom2 = mom2_[id];
-      MT p = master_param ? master_param[id] : static_cast<MT>(param_[id]);
-      MT g = row_idx >= 0
-                 ? static_cast<MT>(grad_[row_idx * row_numel + id % row_numel])
-                 : static_cast<MT>(0);
-      mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
-      mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
-
-      MT denom =
-          (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
-      p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
-
-      // Write back to global memory
-      mom1_out_[id] = mom1;
-      mom2_out_[id] = mom2;
-      param_out_[id] = static_cast<T>(p);
-      if (master_param_out) {
-        master_param_out[id] = p;
-      }
-    }
-  }
-}
-
-template <typename T, typename Context>
-void AdamDenseParamSparseGradKernel(
-    const Context& dev_ctx,
-    const DenseTensor& param,
-    const SelectedRows& grad,
-    const DenseTensor& learning_rate,
-    const DenseTensor& moment1,
-    const DenseTensor& moment2,
-    const DenseTensor& beta1_pow,
-    const DenseTensor& beta2_pow,
-    paddle::optional<const DenseTensor&> master_param,
-    paddle::optional<const DenseTensor&> skip_update,
-    const Scalar& beta1,
-    const Scalar& beta2,
-    const Scalar& epsilon,
-    bool lazy_mode,
-    int64_t min_row_size_to_use_multithread,
-    bool multi_precision,
-    bool use_global_beta_pow,
-    DenseTensor* param_out,
-    DenseTensor* moment1_out,
-    DenseTensor* moment2_out,
-    DenseTensor* beta1_pow_out,
-    DenseTensor* beta2_pow_out,
-    DenseTensor* master_param_outs) {
-  using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
-
-  VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
-
-  bool skip_update_ = false;
-  if (skip_update.is_initialized()) {
-    PADDLE_ENFORCE_EQ(
-        skip_update->numel(),
-        1,
-        errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
-                                skip_update->numel()));
-    std::vector<bool> skip_update_vec;
-    paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
-    skip_update_ = skip_update_vec[0];
-  }
-  // skip_update=true, just copy input to output, and TensorCopy will call
-  // mutable_data
-  if (skip_update_) {
-    VLOG(4) << "Adam skip update";
-    phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
-    phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
-    phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
-    phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
-    phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
-    return;
-  }
-
-  MPDType beta1_ = beta1.to<MPDType>();
-  MPDType beta2_ = beta2.to<MPDType>();
-  MPDType epsilon_ = epsilon.to<MPDType>();
-  VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
-          << "beta2_pow.numel() : " << beta2_pow.numel();
-  VLOG(3) << "param.numel(): " << param.numel();
-  PADDLE_ENFORCE_EQ(
-      beta1_pow_out->numel(),
-      1,
-      errors::InvalidArgument("beta1 pow output size should be 1, but received "
-                              "value is:%d.",
-                              beta1_pow_out->numel()));
-
-  PADDLE_ENFORCE_EQ(
-      beta2_pow_out->numel(),
-      1,
-      errors::InvalidArgument("beta2 pow output size should be 1, but received "
-                              "value is:%d.",
-                              beta2_pow_out->numel()));
-
-  const MPDType* master_in_data =
-      multi_precision ? master_param->data<MPDType>() : nullptr;
-  MPDType* master_out_data =
-      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_outs)
-                      : nullptr;
-
-  if (grad.rows().size() == 0) {
-    VLOG(3) << "grad row size is 0!!";
-    return;
-  }
-
-  std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
-  bool is_strict_sorted = true;
-  for (size_t i = 1; i < cpu_rows.size(); ++i) {
-    if (cpu_rows[i - 1] >= cpu_rows[i]) {
-      is_strict_sorted = false;
-      break;
-    }
-  }
-
-  phi::SelectedRows tmp_grad_merge;
-  const phi::SelectedRows* grad_merge_ptr;
-  if (is_strict_sorted) {
-    grad_merge_ptr = &grad;
-  } else {
-    // merge duplicated rows if any.
-    // The rows of grad_merge have been sorted inside MergeAdd functor
-    paddle::operators::math::scatter::MergeAdd<Context, T> merge_func;
-    merge_func(dev_ctx, grad, &tmp_grad_merge, true);
-    grad_merge_ptr = &tmp_grad_merge;
-  }
-  auto& grad_merge = *grad_merge_ptr;
-  auto& grad_tensor = grad_merge.value();
-  const T* grad_data = grad_tensor.template data<T>();
-  auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
-  const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
-  auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
-
-  if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
-    int threads = 512;
-    int ndim = param.numel();
-    int blocks = (ndim + threads - 1) / threads;
-
-    SparseAdamCUDAKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        beta1_,
-        beta2_,
-        epsilon_,
-        *beta1_pow.data<MPDType>(),
-        *beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad_data,
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        rows,
-        row_numel,
-        grad_merge.rows().size(),
-        lazy_mode,
-        ndim);
-    if (!use_global_beta_pow) {
-      // Update with cpu
-      dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
-          beta1_ * beta1_pow.data<MPDType>()[0];
-      dev_ctx.template HostAlloc<MPDType>(beta2_pow_out)[0] =
-          beta2_ * beta2_pow.data<MPDType>()[0];
-    }
-  } else {
-    funcs::SparseAdamFunctor<T, funcs::GPUAdam, MPDType> functor(
-        beta1_,
-        beta2_,
-        epsilon_,
-        beta1_pow.data<MPDType>(),
-        beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad_data,
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        rows,
-        row_numel,
-        grad_merge.rows().size(),
-        lazy_mode);
-
-    // FIXME(minqiyang): remove BinarySearch in GPU later
-    funcs::ForRange<Context> for_range(dev_ctx, param.numel());
-    for_range(functor);
-    if (!use_global_beta_pow) {
-      // update beta1 and beta2
-      UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
-          beta1_,
-          beta2_,
-          beta1_pow.data<MPDType>(),
-          beta2_pow.data<MPDType>(),
-          dev_ctx.template Alloc<MPDType>(beta1_pow_out),
-          dev_ctx.template Alloc<MPDType>(beta2_pow_out));
-    }
-  }
-}
-
-}  // namespace sr
-}  // namespace phi
-
-PD_REGISTER_KERNEL(adam_dense_param_sparse_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::sr::AdamDenseParamSparseGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
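In the sparse GPU path deleted above, each thread maps its flat element id back to a parameter row (id / row_numel) and binary-searches the sorted gradient rows for it; under lazy_mode the element is skipped when no gradient row exists, otherwise a zero gradient is substituted. A host-side sketch of that lookup, mirroring what phi::funcs::BinarySearch is used for here (illustrative, not the phi implementation):

#include <cstdint>

// Return the position of key in the sorted array rows[0..count), or -1 if the
// row has no gradient, matching the row_idx < 0 convention in the kernels.
int64_t FindGradRow(const int64_t* rows, int64_t count, int64_t key) {
  int64_t lo = 0, hi = count - 1;
  while (lo <= hi) {
    int64_t mid = lo + (hi - lo) / 2;
    if (rows[mid] == key) return mid;
    if (rows[mid] < key) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  return -1;
}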
- -#include "paddle/phi/kernels/selected_rows/adamw_kernel.h" - -#include // for sqrt in CPU and CUDA -#include - -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/funcs/adam_functors.h" -#include "paddle/phi/kernels/funcs/for_range.h" - -namespace phi { -namespace sr { - -template -__global__ void UpdateAdamWBetaPow(T beta1, - T beta2, - const T* beta1_pow_, - const T* beta2_pow_, - T* beta1_pow_out, - T* beta2_pow_out) { - *beta1_pow_out = beta1 * beta1_pow_[0]; - *beta2_pow_out = beta2 * beta2_pow_[0]; -} - -template -__global__ void SparseAdamWCUDAKernelREG(MT beta1, - MT beta2, - MT epsilon, - MT coeff, - MT lr_ratio, - const MT beta1_pow, - const MT beta2_pow, - const MT* mom1_, - MT* mom1_out_, - const MT* mom2_, - MT* mom2_out_, - const MT* lr_, - const T* grad_, - const T* param_, - T* param_out_, - const MT* master_param, - MT* master_param_out, - const int64_t* rows_, - int64_t row_numel, - int64_t row_count, - bool lazy_mode, - int ndim) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - MT lr = *lr_ * lr_ratio; - - for (; id < ndim; id += blockDim.x * gridDim.x) { - auto row_idx = - phi::funcs::BinarySearch(rows_, row_count, id / row_numel); - if (lazy_mode && row_idx < 0) { - return; - } else { - MT mom1 = static_cast(mom1_[id]); - MT mom2 = static_cast(mom2_[id]); - - MT p = master_param ? master_param[id] : static_cast(param_[id]); - MT g = row_idx >= 0 - ? static_cast(grad_[row_idx * row_numel + id % row_numel]) - : static_cast(0); - - p *= (static_cast(1.0) - lr * coeff); - - mom1 = beta1 * mom1 + (static_cast(1.0) - beta1) * g; - mom2 = beta2 * mom2 + (static_cast(1.0) - beta2) * g * g; - - MT denom = - (sqrt(mom2) / sqrt(static_cast(1.0) - beta2_pow)) + epsilon; - - p += (mom1 / denom) * (-(lr / (static_cast(1.0) - beta1_pow))); - - // Write back to global memory - mom1_out_[id] = mom1; - mom2_out_[id] = mom2; - param_out_[id] = static_cast(p); - if (master_param_out) { - master_param_out[id] = p; - } - } - } -} - -template -void AdamwDenseParamSparseGradKernel( - const Context& dev_ctx, - const DenseTensor& param, - const SelectedRows& grad, - const DenseTensor& learning_rate, - const DenseTensor& moment1, - const DenseTensor& moment2, - const DenseTensor& beta1_pow, - const DenseTensor& beta2_pow, - paddle::optional master_param, - paddle::optional skip_update, - const Scalar& beta1, - const Scalar& beta2, - const Scalar& epsilon, - float lr_ratio, - float coeff, - bool with_decay, - bool lazy_mode, - int64_t min_row_size_to_use_multithread, - bool multi_precision, - bool use_global_beta_pow, - DenseTensor* param_out, - DenseTensor* moment1_out, - DenseTensor* moment2_out, - DenseTensor* beta1_pow_out, - DenseTensor* beta2_pow_out, - DenseTensor* master_param_outs) { - using MPDType = typename phi::dtype::MPTypeTrait::Type; - - VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; - - MPDType coeff_ = static_cast(coeff); - MPDType lr_ratio_ = static_cast(lr_ratio); - - bool skip_update_ = false; - if (skip_update.is_initialized()) { - PADDLE_ENFORCE_EQ( - skip_update->numel(), - 1, - errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d", - skip_update->numel())); - std::vector skip_update_vec; - 
-    paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
-    skip_update_ = skip_update_vec[0];
-  }
-
-  // skip_update=true, just copy input to output, and TensorCopy will call
-  // mutable_data
-  if (skip_update_) {
-    VLOG(4) << "Adamw skip update";
-    phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
-    phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
-    phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
-    phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
-    phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
-    return;
-  }
-
-  // if with_decay = false, coeff = 0
-  if (!with_decay) {
-    coeff_ = static_cast<MPDType>(0.0);
-  }
-
-  MPDType beta1_ = beta1.to<MPDType>();
-  MPDType beta2_ = beta2.to<MPDType>();
-  MPDType epsilon_ = epsilon.to<MPDType>();
-  VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
-          << "beta2_pow.numel() : " << beta2_pow.numel();
-  VLOG(3) << "param.numel(): " << param.numel();
-  PADDLE_ENFORCE_EQ(
-      beta1_pow_out->numel(),
-      1,
-      errors::InvalidArgument("beta1 pow output size should be 1, but received "
-                              "value is:%d.",
-                              beta1_pow_out->numel()));
-
-  PADDLE_ENFORCE_EQ(
-      beta2_pow_out->numel(),
-      1,
-      errors::InvalidArgument("beta2 pow output size should be 1, but received "
-                              "value is:%d.",
-                              beta2_pow_out->numel()));
-
-  const MPDType* master_in_data =
-      multi_precision ? master_param->data<MPDType>() : nullptr;
-  MPDType* master_out_data =
-      multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_outs)
-                      : nullptr;
-
-  if (grad.rows().size() == 0) {
-    VLOG(3) << "grad row size is 0!!";
-    return;
-  }
-
-  std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
-  bool is_strict_sorted = true;
-  for (size_t i = 1; i < cpu_rows.size(); ++i) {
-    if (cpu_rows[i - 1] >= cpu_rows[i]) {
-      is_strict_sorted = false;
-      break;
-    }
-  }
-
-  phi::SelectedRows tmp_grad_merge;
-  const phi::SelectedRows* grad_merge_ptr;
-  if (is_strict_sorted) {
-    grad_merge_ptr = &grad;
-  } else {
-    // merge duplicated rows if any.
-    // The rows of grad_merge have been sorted inside MergeAdd functor
-    paddle::operators::math::scatter::MergeAdd<Context, T> merge_func;
-    merge_func(dev_ctx, grad, &tmp_grad_merge, true);
-    grad_merge_ptr = &tmp_grad_merge;
-  }
-  auto& grad_merge = *grad_merge_ptr;
-  auto& grad_tensor = grad_merge.value();
-  const T* grad_data = grad_tensor.template data<T>();
-  auto* grad_merge_rows = &grad_merge.rows();
-  paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
-  const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
-  auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
-
-  if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
-    int threads = 512;
-    int ndim = param.numel();
-    int blocks = (ndim + threads - 1) / threads;
-
-    SparseAdamWCUDAKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
-        beta1_,
-        beta2_,
-        epsilon_,
-        coeff_,
-        lr_ratio_,
-        *beta1_pow.data<MPDType>(),
-        *beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad_data,
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        rows,
-        row_numel,
-        grad_merge.rows().size(),
-        lazy_mode,
-        ndim);
-    if (!use_global_beta_pow) {
-      // Update with cpu
-      dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
-          beta1_ * beta1_pow.data<MPDType>()[0];
-      dev_ctx.template HostAlloc<MPDType>(beta2_pow_out)[0] =
-          beta2_ * beta2_pow.data<MPDType>()[0];
-    }
-  } else {
-    funcs::SparseAdamWFunctor<T, funcs::GPUAdamW, MPDType> functor(
-        beta1_,
-        beta2_,
-        epsilon_,
-        coeff_,
-        lr_ratio_,
-        beta1_pow.data<MPDType>(),
-        beta2_pow.data<MPDType>(),
-        moment1.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment1_out),
-        moment2.data<MPDType>(),
-        dev_ctx.template Alloc<MPDType>(moment2_out),
-        learning_rate.data<MPDType>(),
-        grad_data,
-        param.data<T>(),
-        dev_ctx.template Alloc<T>(param_out),
-        master_in_data,
-        master_out_data,
-        rows,
-        row_numel,
-        grad_merge.rows().size(),
-        lazy_mode);
-
-    // FIXME(minqiyang): remove BinarySearch in GPU later
-    funcs::ForRange<Context> for_range(dev_ctx, param.numel());
-    for_range(functor);
-    if (!use_global_beta_pow) {
-      // update beta1 and beta2
-      UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
-          beta1_,
-          beta2_,
-          beta1_pow.data<MPDType>(),
-          beta2_pow.data<MPDType>(),
-          dev_ctx.template Alloc<MPDType>(beta1_pow_out),
-          dev_ctx.template Alloc<MPDType>(beta2_pow_out));
-    }
-  }
-}
-
-}  // namespace sr
-}  // namespace phi
-
-PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::sr::AdamwDenseParamSparseGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16) {}
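The two compat files deleted below routed the fluid adam/adamw ops onto phi kernels: the kernel name was chosen from the variable type of Grad, and each scalar attribute fell back from its tensor input (e.g. Beta1Tensor) to the plain attribute. A stripped-down sketch of the selection rule, with booleans standing in for the ArgumentMappingContext queries:

#include <string>

// Kernel-name choice made by the deleted Adam(W)OpArgumentMapping functions:
// SelectedRows gradients map to the *_dense_param_sparse_grad kernels.
std::string SelectAdamKernel(const std::string& op,  // "adam" or "adamw"
                             bool grad_is_selected_rows,
                             bool grad_is_dense_tensor) {
  if (grad_is_selected_rows) return op + "_dense_param_sparse_grad";
  if (grad_is_dense_tensor) return op;
  return "unregistered";
}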
diff --git a/paddle/phi/ops/compat/adam_sig.cc b/paddle/phi/ops/compat/adam_sig.cc
deleted file mode 100644
index 0bc31cd28cb6c463a88e7a26ee3c63585fc9098f..0000000000000000000000000000000000000000
--- a/paddle/phi/ops/compat/adam_sig.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <string>
-
-#include "paddle/phi/core/compat/op_utils.h"
-#include "paddle/utils/small_vector.h"
-
-namespace phi {
-
-KernelSignature AdamOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  paddle::SmallVector<const char*> in_names = {"Param",
-                                               "Grad",
-                                               "LearningRate",
-                                               "Moment1",
-                                               "Moment2",
-                                               "Beta1Pow",
-                                               "Beta2Pow",
-                                               "MasterParam",
-                                               "SkipUpdate"};
-  paddle::SmallVector<const char*> out_names = {"ParamOut",
-                                                "Moment1Out",
-                                                "Moment2Out",
-                                                "Beta1PowOut",
-                                                "Beta2PowOut",
-                                                "MasterParamOut"};
-  paddle::SmallVector<const char*> attr_names;
-
-  attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor"
-                                                      : "beta1");
-  attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor"
-                                                      : "beta2");
-  attr_names.emplace_back(ctx.HasInput("EpsilonTensor") ? "EpsilonTensor"
-                                                        : "epsilon");
-  attr_names.emplace_back("lazy_mode");
-  attr_names.emplace_back("min_row_size_to_use_multithread");
-  attr_names.emplace_back("multi_precision");
-  attr_names.emplace_back("use_global_beta_pow");
-
-  if (ctx.IsSelectedRowsInput("Grad")) {
-    return KernelSignature("adam_dense_param_sparse_grad",
-                           std::move(in_names),
-                           std::move(attr_names),
-                           std::move(out_names));
-  } else if (ctx.IsDenseTensorInput("Grad")) {
-    return KernelSignature("adam",
-                           std::move(in_names),
-                           std::move(attr_names),
-                           std::move(out_names));
-  } else {
-    return KernelSignature("unregistered", {}, {}, {});
-  }
-}
-
-}  // namespace phi
-
-PD_REGISTER_ARG_MAPPING_FN(adam, phi::AdamOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/adamw_sig.cc b/paddle/phi/ops/compat/adamw_sig.cc
deleted file mode 100644
index 763304bdf3511561571b49153fd458643acb3a2b..0000000000000000000000000000000000000000
--- a/paddle/phi/ops/compat/adamw_sig.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include <string>
-
-#include "paddle/phi/core/compat/op_utils.h"
-#include "paddle/utils/small_vector.h"
-
-namespace phi {
-
-KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) {
-  paddle::SmallVector<const char*> in_names = {"Param",
-                                               "Grad",
-                                               "LearningRate",
-                                               "Moment1",
-                                               "Moment2",
-                                               "Beta1Pow",
-                                               "Beta2Pow",
-                                               "MasterParam",
-                                               "SkipUpdate"};
-  paddle::SmallVector<const char*> out_names = {"ParamOut",
-                                                "Moment1Out",
-                                                "Moment2Out",
-                                                "Beta1PowOut",
-                                                "Beta2PowOut",
-                                                "MasterParamOut"};
-  paddle::SmallVector<const char*> attr_names;
-
-  attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor"
-                                                      : "beta1");
-  attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor"
-                                                      : "beta2");
"EpsilonTensor" - : "epsilon"); - attr_names.emplace_back("lr_ratio"); - attr_names.emplace_back("coeff"); - attr_names.emplace_back("with_decay"); - attr_names.emplace_back("lazy_mode"); - attr_names.emplace_back("min_row_size_to_use_multithread"); - attr_names.emplace_back("multi_precision"); - attr_names.emplace_back("use_global_beta_pow"); - - if (ctx.IsSelectedRowsInput("Grad")) { - return KernelSignature("adamw_dense_param_sparse_grad", - std::move(in_names), - std::move(attr_names), - std::move(out_names)); - } else if (ctx.IsDenseTensorInput("Grad")) { - return KernelSignature("adamw", - std::move(in_names), - std::move(attr_names), - std::move(out_names)); - } else { - return KernelSignature("unregistered", {}, {}, {}); - } -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(adamw, phi::AdamwOpArgumentMapping); diff --git a/tools/infrt/get_compat_kernel_signature.py b/tools/infrt/get_compat_kernel_signature.py index a66a236b0f9759759f83aa4419cfb9cdcf9b3712..08ecb4041a8b68fdbb003ace23646ab11628b509 100644 --- a/tools/infrt/get_compat_kernel_signature.py +++ b/tools/infrt/get_compat_kernel_signature.py @@ -16,7 +16,7 @@ import os import re import json -skip_list = ["adam_sig.cc", "adamw_sig.cc"] +skip_list = [] def is_grad_kernel(kernel_info):