Unverified commit 183a74db authored by RedContritio, committed by GitHub

support auto generate for op adam, adamw and merged_adam optimizer (#52711)

* support auto generate for op adam optimizer

* remove unnecessary files

* support auto generate for op adamw optimizer

* support auto generate for op merged_adam optimizer

* use manual_signature in adam_
Parent fe6abd4d
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(adam,
AdamInferMetaFunctor,
PD_INFER_META(phi::AdamInferMeta));
REGISTER_OPERATOR(
adam,
ops::AdamOp,
ops::AdamOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
AdamInferMetaFunctor);
REGISTER_OP_VERSION(adam)
.AddCheckpoint(
R"ROC(
Upgrade adam add 1 attribute [multi_precision].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"multi_precision",
"(bool) Whether to use multi-precision during weight updating.",
false))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 dispensable input [EpsilonTensor].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewInput(
"EpsilonTensor",
"If provided, Adam will use this as epsilon, "
"this has a higher priority than attr(epsilon). "
"For better performance in npu kernel. "))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 attribute [use_global_beta_pow].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"use_global_beta_pow",
"If true, Adam will use global beta_pow for whole model "
"instead of creating beta_pow for each parameter."
"In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be "
"used in adam op, "
"and beta_pow will be updated after all adam op in the model.",
false))
.AddCheckpoint(
R"ROC(
Upgrade adam, add 1 dispensable input [SkipUpdate].
)ROC",
paddle::framework::compatible::OpVersionDesc().NewInput(
"SkipUpdate", "If the value is true, Adam will skip the update."));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
class AdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type =
OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return phi::KernelKey(input_data_type, ctx.GetPlace());
}
phi::KernelKey GetKernelTypeForVar(
const std::string &var_name,
const phi::DenseTensor &tensor,
const phi::KernelKey &expected_kernel_type) const {
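// Beta1Pow, Beta2Pow and SkipUpdate may live on the CPU even when Param is
// on an accelerator, so accept any backend for them and do not force a
// device transform.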
if (var_name == "Beta1Pow" || var_name == "Beta2Pow" ||
var_name == "SkipUpdate") {
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
} else {
return phi::KernelKey(
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}
}
};
class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("LearningRate", "(Tensor) Learning rate");
AddInput("Moment1", "(Tensor) Input first moment");
AddInput("Moment2", "(Tensor) Input second moment");
AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
AddInput("Beta1Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta1, this has a higher priority than attr(beta1), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("Beta2Tensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as beta2, this has a higher priority than attr(beta2), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("EpsilonTensor",
"(Tensor<float32>, optional) If provided, Adam will use this "
"as epsilon, this has a higher priority than attr(epsilon), the "
"shape of this tensor MUST BE [1].")
.AsDispensable();
AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
AddInput("SkipUpdate", "(Tensor<bool>, optional), Skip the update or not.")
.AsDispensable();
AddOutput("ParamOut", "(Tensor) Output parameter");
AddOutput("Moment1Out", "(Tensor) Output first moment");
AddOutput("Moment2Out", "(Tensor) Output second moment");
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator");
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator");
AddOutput("MasterParamOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam).")
.AsDispensable();
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f);
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f);
AddAttr<float>("epsilon",
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddAttr<bool>(
"lazy_mode",
"(bool, default false) "
"only update the parameter that has gradient in sparse update")
.SetDefault(false);
AddAttr<int64_t>("min_row_size_to_use_multithread",
"(int64_t, default 0) "
"when not zero, if param row size is larger then "
"min_row_size_to_use_multithread and "
"inner_op_parallelism is larger then 0, sparse update "
"will run in multithread mode")
.SetDefault(1000);
AddAttr<bool>("multi_precision",
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
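For reference, the update rule stated in the DOC string above can be written as a small self-contained sketch. This is plain illustrative C++ with hypothetical names (AdamState, AdamStep); the real computation lives in the phi adam kernels, not in this operator definition.

#include <cmath>

// One Adam step following the formulas in the DOC string above.
// Illustrative only; not part of the Paddle code base.
struct AdamState {
  float moment1 = 0.f;    // first moment estimate
  float moment2 = 0.f;    // second moment estimate
  float beta1_pow = 1.f;  // running product of beta1
  float beta2_pow = 1.f;  // running product of beta2
};

inline float AdamStep(float param, float grad, float learning_rate,
                      AdamState* s, float beta1 = 0.9f, float beta2 = 0.999f,
                      float epsilon = 1.0e-8f) {
  s->moment1 = beta1 * s->moment1 + (1.f - beta1) * grad;
  s->moment2 = beta2 * s->moment2 + (1.f - beta2) * grad * grad;
  s->beta1_pow *= beta1;
  s->beta2_pow *= beta2;
  const float lr_t = learning_rate * std::sqrt(1.f - s->beta2_pow) /
                     (1.f - s->beta1_pow);
  return param - lr_t * s->moment1 / (std::sqrt(s->moment2) + epsilon);
}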
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
namespace paddle {
namespace operators {
namespace scatter = phi::funcs::scatter;
static inline float GetAttrFromTensor(const phi::DenseTensor* tensor) {
const float* tensor_data = tensor->data<float>();
phi::DenseTensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
paddle::framework::TensorCopySync(
*tensor, platform::CPUPlace(), &cpu_tensor);
tensor_data = cpu_tensor.data<float>();
}
if (platform::is_xpu_place(tensor->place())) {
paddle::framework::TensorCopySync(
*tensor, platform::CPUPlace(), &cpu_tensor);
tensor_data = cpu_tensor.data<float>();
}
return tensor_data[0];
}
} // namespace operators
} // namespace paddle
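GetAttrFromTensor above copies a scalar tensor to the CPU when it lives on a GPU or XPU and returns its first element; this is what lets the dispensable Beta1Tensor/Beta2Tensor/EpsilonTensor inputs override the corresponding float attributes. A minimal sketch of a hypothetical caller follows (ResolveBeta1 is illustrative, not Paddle code; it assumes the fluid ExecutionContext API and the header above).

// Prefer the tensor-valued beta1 over the float attribute when it is fed.
static float ResolveBeta1(const paddle::framework::ExecutionContext& ctx) {
  if (ctx.HasInput("Beta1Tensor")) {
    const auto* beta1_tensor = ctx.Input<phi::DenseTensor>("Beta1Tensor");
    // GetAttrFromTensor copies to the CPU first if the tensor is on GPU/XPU.
    return paddle::operators::GetAttrFromTensor(beta1_tensor);
  }
  return ctx.Attr<float>("beta1");
}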
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
class AdamWOp : public AdamOp {
using AdamOp::AdamOp;
};
class AdamWOpMaker : public AdamOpMaker {
public:
void Make() {
AdamOpMaker::Make();
AddAttr<float>("lr_ratio",
"(float, default 1.0) "
"layerwise learning rate decay")
.SetDefault(1.0f);
AddAttr<float>("coeff",
"(float, default 0.01) "
"coeff of the weight decay")
.SetDefault(0.01f);
AddAttr<bool>("with_decay",
"(bool, default false) "
"whether to do weight decay")
.SetDefault(false);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(adamw,
AdamwInferMetaFunctor,
PD_INFER_META(phi::AdamwInferMeta));
REGISTER_OPERATOR(
adamw,
ops::AdamWOp,
ops::AdamWOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
AdamwInferMetaFunctor);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
class MergedAdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
phi::KernelKey GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
auto param_dtype =
framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return phi::KernelKey(param_dtype, ctx.GetPlace());
}
phi::KernelKey GetKernelTypeForVar(
const std::string& var_name,
const phi::DenseTensor& tensor,
const phi::KernelKey& expected_kernel_type) const override {
if (var_name == "Beta1Pow" || var_name == "Beta2Pow" ||
var_name == "SkipUpdate") {
return phi::KernelKey(phi::Backend::ALL_BACKEND,
expected_kernel_type.layout(),
expected_kernel_type.dtype());
} else {
return phi::KernelKey(
tensor.place(), tensor.layout(), expected_kernel_type.dtype());
}
}
};
class MergedAdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param", "(Tensor, default Tensor<float>) Input parameter")
.AsDuplicable();
AddInput("Grad", "(Tensor, default Tensor<float>) Input gradient")
.AsDuplicable();
AddInput("LearningRate", "(Tensor, default Tensor<float>) Learning rate")
.AsDuplicable();
AddInput("Moment1", "(Tensor, default Tensor<float>) Input first moment")
.AsDuplicable();
AddInput("Moment2", "(Tensor, default Tensor<float>) Input second moment")
.AsDuplicable();
AddInput("Beta1Pow",
"(Tensor, default Tensor<float>) Input beta1 power accumulator")
.AsDuplicable();
AddInput("Beta2Pow",
"(Tensor, default Tensor<float>) Input beta2 power accumulator")
.AsDuplicable();
AddInput("MasterParam", "FP32 master weight for AMP.")
.AsDispensable()
.AsDuplicable();
AddOutput("ParamOut", "(Tensor) Output parameter").AsDuplicable();
AddOutput("Moment1Out", "(Tensor) Output first moment").AsDuplicable();
AddOutput("Moment2Out", "(Tensor) Output second moment").AsDuplicable();
AddOutput("Beta1PowOut", "(Tensor) Output beta1 power accumulator")
.AsDuplicable();
AddOutput("Beta2PowOut", "(Tensor) Output beta2 power accumulator")
.AsDuplicable();
AddOutput("MasterParamOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam).")
.AsDispensable()
.AsDuplicable();
AddAttr<float>("beta1",
"(float, default 0.9) "
"Exponential decay rate for the "
"first moment estimates.")
.SetDefault(0.9f);
AddAttr<float>("beta2",
"(float, default 0.999) "
"exponential decay rate for the "
"second moment estimates.")
.SetDefault(0.999f);
AddAttr<float>("epsilon",
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddAttr<bool>("multi_precision",
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
// TODO(zhiqiu): We could set Beta1PowOut and Beta2PowOut
// as dispensable since they are not used when use_global_beta_pow is true.
AddAttr<bool>("use_global_beta_pow",
"(bool, default false) "
"Whether to use global beta_pow for whole model instead of "
"creating beta_pow for each parameter.")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
This implements the Adam optimizer from Section 2 of the Adam
paper : https://arxiv.org/abs/1412.6980.
Adam is a first-order gradient-based optimization method based on
adaptive estimates of lower-order moments.
Adam updates:
$$
moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
learning\_rate = learning\_rate *
\frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon}
$$
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
DECLARE_INFER_SHAPE_FUNCTOR(merged_adam,
MergedAdamInferMetaFunctor,
PD_INFER_META(phi::MergedAdamInferMeta));
REGISTER_OPERATOR(
merged_adam,
ops::MergedAdamOp,
ops::MergedAdamOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
MergedAdamInferMetaFunctor);
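merged_adam applies the same Adam update to a whole list of parameters in a single op, which is why every input and output above is marked AsDuplicable. A minimal sketch of that fused loop, reusing the illustrative AdamStep helper from the earlier sketch (hypothetical names, not the phi merged_adam kernel):

#include <cstddef>
#include <vector>

// Fused update over a list of parameters, mirroring how merged_adam consumes
// vectors of Param/Grad/Moment/LearningRate tensors. AdamState and AdamStep
// come from the single-parameter sketch above; everything here is illustrative.
void MergedAdamStepSketch(std::vector<float>* params,
                          const std::vector<float>& grads,
                          const std::vector<float>& learning_rates,
                          std::vector<AdamState>* states) {
  for (std::size_t i = 0; i < params->size(); ++i) {
    (*params)[i] =
        AdamStep((*params)[i], grads[i], learning_rates[i], &(*states)[i]);
  }
}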
@@ -9,7 +9,6 @@ register_unity_group(
ftrl_op.cc
lars_momentum_op.cc
proximal_adagrad_op.cc
adam_op.cc
proximal_gd_op.cc
decayed_adagrad_op.cc
adadelta_op.cc
@@ -22,7 +21,6 @@ register_unity_group(
sgd_op.cu
proximal_adagrad_op.cu
adagrad_op.cu
adam_op.cu
decayed_adagrad_op.cu
adadelta_op.cu
lamb_op.cu)
@@ -21,29 +21,6 @@
optional : master_param
inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out), (master_param -> master_param_out)
- op : adam_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
infer_meta :
func : AdamInferMeta
kernel :
func : adam {dense, dense, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense},
adam_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense}
data_type : param
optional : master_param, skip_update
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs)
- op : adamw_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
infer_meta :
func : AdamwInferMeta
kernel :
func : adamw
data_type : param
optional : master_param, skip_update
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs)
- op : add
args : (Tensor x, Tensor y)
output : Tensor(out)
@@ -816,17 +793,6 @@
func : mean
backward : mean_grad
- op : merged_adam_
args : (Tensor[] param, Tensor[] grad, Tensor[] learning_rate, Tensor[] moment1, Tensor[] moment2, Tensor[] beta1_pow, Tensor[] beta2_pow, Tensor[] master_param, Scalar beta1, Scalar beta2, Scalar epsilon, bool multi_precision, bool use_global_beta_pow)
output : Tensor[](param_out){param.size()}, Tensor[](moment1_out){param.size()}, Tensor[](moment2_out){param.size()}, Tensor[](beta1_pow_out){param.size()}, Tensor[](beta2_pow_out){param.size()}, Tensor[](master_param_out){param.size()}
infer_meta :
func : MergedAdamInferMeta
optional: master_param
kernel :
func : merged_adam
data_type : param
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : min
args : (Tensor x, IntArray axis={}, bool keepdim=false)
output : Tensor(out)
......
@@ -50,12 +50,45 @@
outputs :
{ param_out : ParamOut, moment_out : MomentOut, master_param_out : MasterParamOut }
- op : adam_
inputs :
{param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate}
outputs :
{param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut}
scalar :
beta1 :
data_type : float
tensor_name : Beta1Tensor
beta2 :
data_type : float
tensor_name : Beta2Tensor
epsilon :
data_type : float
tensor_name : EpsilonTensor
manual_signature : [adam_]
- op : adamax_
inputs :
{param : Param, grad: Grad, learning_rate : LearningRate, moment : Moment, inf_norm : InfNorm, beta1_pow : Beta1Pow, master_param : MasterParam}
outputs :
{param_out : ParamOut, moment_out : MomentOut, inf_norm_out : InfNormOut, master_param_out : MasterParamOut}
- op : adamw_
inputs :
{param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam, skip_update: SkipUpdate}
outputs :
{param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut}
scalar :
beta1 :
data_type : float
tensor_name : Beta1Tensor
beta2 :
data_type : float
tensor_name : Beta2Tensor
epsilon :
data_type : float
tensor_name : EpsilonTensor
- op : add (elementwise_add)
backward : add_grad (elementwise_add_grad)
extra :
@@ -1454,6 +1487,22 @@
outputs :
out : Out
- op : merged_adam_
inputs :
{param: Param, grad: Grad, learning_rate: LearningRate, moment1: Moment1, moment2: Moment2, beta1_pow: Beta1Pow, beta2_pow: Beta2Pow, master_param: MasterParam}
outputs :
{param_out: ParamOut, moment1_out: Moment1Out, moment2_out: Moment2Out, beta1_pow_out: Beta1PowOut, beta2_pow_out: Beta2PowOut, master_param_out: MasterParamOut}
scalar :
beta1 :
data_type : float
support_tensor : true
beta2 :
data_type : float
support_tensor : true
epsilon :
data_type : float
support_tensor : true
- op : merged_momentum_
inputs :
{param : Param, grad : Grad, velocity : Velocity, learning_rate : LearningRate, master_param : MasterParam}
......
- op : adam_
version :
- checkpoint : Upgrade adam add 1 attribute [multi_precision].
action :
- add_attr : multi_precision
comment : (bool) Whether to use multi-precision during weight updating.
default : "false"
- checkpoint : Upgrade adam, add 1 dispensable input [EpsilonTensor].
action :
- add_input : EpsilonTensor
comment : If provided, Adam will use this as epsilon, this has a higher priority than attr(epsilon). For better performance in npu kernel.
- checkpoint : Upgrade adam, add 1 attribute [use_global_beta_pow].
action :
- add_attr : use_global_beta_pow
comment : If true, Adam will use global beta_pow for whole model instead of creating beta_pow for each parameter. In that case, the outputs(Beta1PowOut, Beta2PowOut) will not be used in adam op, and beta_pow will be updated after all adam op in the model.
default : "false"
- checkpoint : Upgrade adam, add 1 dispensable input [SkipUpdate].
action :
- add_input : SkipUpdate
comment : If the value is true, Adam will skip the update.
- op : affine_grid
version :
- checkpoint : Compatible upgrade of affine_grid, add a new attribute [align_corners].
......
@@ -44,6 +44,18 @@
optional : master_param, master_param_out
inplace : (param -> param_out), (moment -> moment_out), (master_param -> master_param_out)
- op : adam_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, bool lazy_mode = false, int64_t min_row_size_to_use_multithread = 1000, bool multi_precision = false, bool use_global_beta_pow = false)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_out)
infer_meta :
func : AdamInferMeta
kernel :
func : adam {dense, dense, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense},
adam_dense_param_sparse_grad {dense, selected_rows, dense, dense, dense, dense, dense, dense, dense -> dense, dense, dense, dense, dense, dense}
data_type : param
optional : master_param, skip_update, master_param_out
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : adamax_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, Tensor master_param, float beta1 = 0.9f, float beta2 = 0.999f, float epsilon = 1.0e-8f, bool multi_precision = false)
output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out), Tensor(master_param_out)
@@ -55,6 +67,17 @@
optional : master_param, master_param_out
inplace : (param -> param_out), (moment -> moment_out), (inf_norm -> inf_norm_out), (master_param ->master_param_out)
- op : adamw_
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, float lr_ratio = 1.0f, float coeff = 0.01f, bool with_decay = false, bool lazy_mode = false, int64_t min_row_size_to_use_multithread = 1000, bool multi_precision = false, bool use_global_beta_pow = false)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_out)
infer_meta :
func : AdamwInferMeta
kernel :
func : adamw
data_type : param
optional : master_param, skip_update, master_param_out
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : addmm
args : (Tensor input, Tensor x, Tensor y, float beta=1.0, float alpha=1.0)
output : Tensor
@@ -1257,6 +1280,17 @@
kernel :
func : merge_selected_rows {selected_rows -> selected_rows}
- op : merged_adam_
args : (Tensor[] param, Tensor[] grad, Tensor[] learning_rate, Tensor[] moment1, Tensor[] moment2, Tensor[] beta1_pow, Tensor[] beta2_pow, Tensor[] master_param, Scalar beta1 = 0.9f, Scalar beta2 = 0.999f, Scalar epsilon = 1.0e-8f, bool multi_precision = false, bool use_global_beta_pow = false)
output : Tensor[](param_out){param.size()}, Tensor[](moment1_out){param.size()}, Tensor[](moment2_out){param.size()}, Tensor[](beta1_pow_out){param.size()}, Tensor[](beta2_pow_out){param.size()}, Tensor[](master_param_out){param.size()}
infer_meta :
func : MergedAdamInferMeta
kernel :
func : merged_adam
data_type : param
optional: master_param, master_param_out
inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out)
- op : merged_momentum_
args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f)
output : Tensor[](param_out){param.size()}, Tensor[](velocity_out){param.size()}, Tensor[](master_param_out){param.size()}
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/utils/small_vector.h"
namespace phi {
KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) {
paddle::small_vector<const char*> in_names = {"Param",
"Grad",
"LearningRate",
"Moment1",
"Moment2",
"Beta1Pow",
"Beta2Pow",
"MasterParam",
"SkipUpdate"};
paddle::small_vector<const char*> out_names = {"ParamOut",
"Moment1Out",
"Moment2Out",
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"};
paddle::small_vector<const char*> attr_names;
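// Tensor-provided hyperparameters take precedence over the float attributes.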
attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor"
: "beta1");
attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor"
: "beta2");
attr_names.emplace_back(ctx.HasInput("EpsilonTensor") ? "EpsilonTensor"
: "epsilon");
attr_names.emplace_back("lr_ratio");
attr_names.emplace_back("coeff");
attr_names.emplace_back("with_decay");
attr_names.emplace_back("lazy_mode");
attr_names.emplace_back("min_row_size_to_use_multithread");
attr_names.emplace_back("multi_precision");
attr_names.emplace_back("use_global_beta_pow");
if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature("adamw_dense_param_sparse_grad",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature("adamw",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else {
return KernelSignature("unregistered", {}, {}, {});
}
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(adamw, phi::AdamwOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/utils/small_vector.h"
namespace phi {
KernelSignature MergedAdamOpArgumentMapping(const ArgumentMappingContext& ctx) {
paddle::small_vector<const char*> in_names = {"Param",
"Grad",
"LearningRate",
"Moment1",
"Moment2",
"Beta1Pow",
"Beta2Pow",
"MasterParam"};
paddle::small_vector<const char*> out_names = {"ParamOut",
"Moment1Out",
"Moment2Out",
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"};
paddle::small_vector<const char*> attr_names = {
"beta1", "beta2", "epsilon", "multi_precision", "use_global_beta_pow"};
return KernelSignature("merged_adam",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(merged_adam, phi::MergedAdamOpArgumentMapping);