Unverified commit ea0a164b, authored by chentianyu03, committed by GitHub

[Yaml]add adamw yaml (#41678)

* add adamw yaml

* fix test case error

* make the names of the weights and biases in linear1 and linear2 constant
Parent 1927aff9
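As context for the diff below: this change wires `paddle.optimizer.AdamW` onto the generated final-state (eager) API path backed by the new `adamw` yaml entry and `adamw_impl`. A minimal, illustrative dygraph usage sketch (shapes and hyperparameters are arbitrary, not taken from this commit):

```python
import paddle

# Illustrative only: a tiny model optimized with AdamW in dygraph mode.
# With this change, eager mode dispatches to _C_ops.final_state_adamw,
# which is generated from the new yaml entry and routed to adamw_impl.
linear = paddle.nn.Linear(13, 5)
opt = paddle.optimizer.AdamW(
    learning_rate=0.1,
    parameters=linear.parameters(),
    weight_decay=0.01)

x = paddle.rand([4, 13])
loss = linear(x).mean()
loss.backward()
opt.step()
opt.clear_grad()
```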
...@@ -217,6 +217,199 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> adam_impl(
////////////////// Forward api impls //////////////////////
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> adamw_impl(
const Tensor& param,
const Tensor& grad,
const Tensor& learning_rate,
const Tensor& moment1,
const Tensor& moment2,
const Tensor& beta1_pow,
const Tensor& beta2_pow,
paddle::optional<const Tensor&> master_param,
paddle::optional<const Tensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow) {
Backend kernel_backend = Backend::UNDEFINED;
DataLayout kernel_layout = DataLayout::UNDEFINED;
DataType kernel_data_type = DataType::UNDEFINED;
if (kernel_backend == Backend::UNDEFINED ||
kernel_layout == DataLayout::UNDEFINED ||
kernel_data_type == DataType::UNDEFINED) {
auto kernel_key_set = ParseKernelKeyByInputArgs(param);
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
if (kernel_backend == Backend::UNDEFINED) {
kernel_backend = kernel_key.backend();
}
if (kernel_layout == DataLayout::UNDEFINED) {
kernel_layout = kernel_key.layout();
}
if (kernel_data_type == DataType::UNDEFINED) {
kernel_data_type = kernel_key.dtype();
}
}
std::string kernel_name = "adamw";
const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
kernel_name, {kernel_backend, kernel_layout, kernel_data_type});
VLOG(6) << kernel_name << " API kernel key: [" << kernel_backend << ", "
<< kernel_layout << ", " << kernel_data_type << "]";
VLOG(6) << kernel_name << " API kernel: " << kernel;
auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
auto input_param = PrepareData(param, kernel.InputAt(0), {});
auto input_grad = PrepareData(grad, kernel.InputAt(1), {});
auto input_lr = PrepareData(learning_rate, kernel.InputAt(2), {});
auto input_moment1 = PrepareData(moment1, kernel.InputAt(3), {});
auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {});
auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {});
auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {});
paddle::optional<const phi::DenseTensor&> input_master_param(paddle::none);
auto input_master_param_ptr =
PrepareData(master_param, kernel.InputAt(7), {});
paddle::optional<const phi::DenseTensor&> input_skip_update(paddle::none);
auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {});
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> api_output;
auto kernel_out_0 = input_param.get();
auto kernel_out_1 = input_moment1.get();
auto kernel_out_2 = input_moment2.get();
auto kernel_out_3 = input_beta1_pow.get();
auto kernel_out_4 = input_beta2_pow.get();
phi::DenseTensor* kernel_out_5 = nullptr;
if (input_master_param_ptr) {
input_master_param =
paddle::make_optional<const phi::DenseTensor&>(*input_master_param_ptr);
kernel_out_5 =
paddle::make_optional<phi::DenseTensor&>(*input_master_param_ptr)
.get_ptr();
}
if (input_skip_update_ptr) {
input_skip_update =
paddle::make_optional<const phi::DenseTensor&>(*input_skip_update_ptr);
}
paddle::optional<const phi::MetaTensor&> input_meta_ref_master_param(
paddle::none);
phi::DenseTensor dt;
phi::MetaTensor input_meta_tmp_master_param(dt);
if (input_master_param_ptr) {
input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype());
input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims());
input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout());
input_meta_ref_master_param = input_meta_tmp_master_param;
}
paddle::optional<const phi::MetaTensor&> input_meta_ref_skip_update(
paddle::none);
phi::DenseTensor dt1;
phi::MetaTensor input_meta_tmp_skip_update(dt1);
if (input_skip_update_ptr) {
input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype());
input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims());
input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout());
input_meta_ref_skip_update = input_meta_tmp_skip_update;
}
phi::MetaTensor meta_out_0(kernel_out_0);
phi::MetaTensor meta_out_1(kernel_out_1);
phi::MetaTensor meta_out_2(kernel_out_2);
phi::MetaTensor meta_out_3(kernel_out_3);
phi::MetaTensor meta_out_4(kernel_out_4);
phi::MetaTensor meta_out_5(kernel_out_5);
phi::AdamwInferMeta(MakeMetaTensor(*input_param),
MakeMetaTensor(*input_grad),
MakeMetaTensor(*input_lr),
MakeMetaTensor(*input_moment1),
MakeMetaTensor(*input_moment2),
MakeMetaTensor(*input_beta1_pow),
MakeMetaTensor(*input_beta2_pow),
input_meta_ref_master_param,
input_meta_ref_skip_update,
beta1,
beta2,
epsilon,
lr_ratio,
coeff,
with_decay,
lazy_mode,
min_row_size_to_use_multithread,
multi_precision,
use_global_beta_pow,
&meta_out_0,
&meta_out_1,
&meta_out_2,
&meta_out_3,
&meta_out_4,
&meta_out_5);
using kernel_signature = void (*)(const platform::DeviceContext&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
const phi::DenseTensor&,
paddle::optional<const phi::DenseTensor&>,
paddle::optional<const phi::DenseTensor&>,
const Scalar&,
const Scalar&,
const Scalar&,
float,
float,
bool,
bool,
int64_t,
bool,
bool,
phi::DenseTensor*,
phi::DenseTensor*,
phi::DenseTensor*,
phi::DenseTensor*,
phi::DenseTensor*,
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(*dev_ctx,
*input_param,
*input_grad,
*input_lr,
*input_moment1,
*input_moment2,
*input_beta1_pow,
*input_beta2_pow,
input_master_param,
input_skip_update,
beta1,
beta2,
epsilon,
lr_ratio,
coeff,
with_decay,
lazy_mode,
min_row_size_to_use_multithread,
multi_precision,
use_global_beta_pow,
kernel_out_0,
kernel_out_1,
kernel_out_2,
kernel_out_3,
kernel_out_4,
kernel_out_5);
return api_output;
}
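`adamw_impl` above only selects and dispatches the phi `adamw` kernel; the arithmetic lives in the kernel itself. As a reading aid for the scalar attributes (beta1, beta2, epsilon, lr_ratio, coeff, with_decay), here is a minimal Python sketch of a decoupled-weight-decay Adam step. It is an assumed simplification that ignores master_param, skip_update, lazy_mode, multi_precision, and use_global_beta_pow, not the kernel's exact code:

```python
import numpy as np

def adamw_update(param, grad, m, v, beta1_pow, beta2_pow,
                 lr, beta1, beta2, epsilon, lr_ratio, coeff, with_decay):
    """One AdamW step on numpy arrays (illustrative sketch only)."""
    lr_t = lr * lr_ratio                      # layer-wise learning-rate scaling
    if with_decay:
        param = param * (1.0 - lr_t * coeff)  # decoupled weight decay
    m = beta1 * m + (1.0 - beta1) * grad
    v = beta2 * v + (1.0 - beta2) * grad * grad
    m_hat = m / (1.0 - beta1_pow)             # bias correction
    v_hat = v / (1.0 - beta2_pow)
    param = param - lr_t * m_hat / (np.sqrt(v_hat) + epsilon)
    # beta*_pow accumulate across steps, which is why the impl also
    # returns beta1_pow_out / beta2_pow_out as outputs.
    return param, m, v, beta1_pow * beta1, beta2_pow * beta2
```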
Tensor conv2d_impl(const Tensor& input,
const Tensor& filter,
const std::vector<int>& strides,
...
...@@ -49,6 +49,27 @@ std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> adam_impl(
bool multi_precision,
bool use_global_beta_pow);
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> adamw_impl(
const Tensor& param,
const Tensor& grad,
const Tensor& learning_rate,
const Tensor& moment1,
const Tensor& moment2,
const Tensor& beta1_pow,
const Tensor& beta2_pow,
paddle::optional<const Tensor&> master_param,
paddle::optional<const Tensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow);
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_impl(
const Tensor& x,
const Tensor& scale,
...
...@@ -20,6 +20,7 @@ import paddle.fluid as fluid
from op_test import OpTest
from functools import partial
from paddle.framework import core
from paddle.fluid.framework import _test_eager_guard
def adamw_step(inputs, attributes):
...@@ -238,6 +239,11 @@ class TestAdamWOp(unittest.TestCase):
adam = paddle.optimizer.AdamW(
0.1, epsilon=-1, parameters=linear.parameters())
def test_api_eager_dygraph(self):
    with _test_eager_guard():
        self.test_adamw_op_dygraph()
        self.test_adamw_op_invalid_input()
class TestAdamWOpGroup(TestAdamWOp):
def test_adamw_op_dygraph(self):
...@@ -319,6 +325,12 @@ class TestAdamWOpLayerwiseLR(TestAdamWOp):
linear1 = paddle.nn.Linear(13, 8)
linear2 = paddle.nn.Linear(8, 5)
# fix the linear names; the simple_lr_setting function will use them (see the sketch after this hunk)
linear1.weight.name = "linear_1.w_0"
linear1.bias.name = "linear_1.b_0"
linear2.weight.name = "linear_2.w_0"
linear2.bias.name = "linear_2.b_0"
simple_lr_fun = partial(simple_lr_setting, decay_rate=0.8, n_layers=2)
adam = paddle.optimizer.AdamW(
...
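The parameter names are pinned above because the layer-wise LR helper derives each parameter's learning-rate ratio from its name. A hypothetical decay rule of that shape, for illustration only (`layer_lr_ratio` and its exact formula are assumptions, not the actual `simple_lr_setting` used by the test):

```python
import re

def layer_lr_ratio(param_name, decay_rate=0.8, n_layers=2):
    """Hypothetical layer-wise LR rule: deeper layers keep a larger ratio.

    Parses the layer index out of names like "linear_1.w_0", which is why
    the test pins the linear1/linear2 parameter names to a fixed pattern.
    """
    match = re.search(r"linear_(\d+)", param_name)
    depth = int(match.group(1)) if match else 0
    return decay_rate ** (n_layers + 1 - depth)

# e.g. "linear_1.w_0" -> 0.8 ** 2, "linear_2.w_0" -> 0.8 ** 1
```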
...@@ -290,14 +290,24 @@ class AdamW(Adam):
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
if framework.in_dygraph_mode():
    found_inf = self._get_auxiliary_var('found_inf')
    _, _, _, _, _, _ = _C_ops.final_state_adamw(
        param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
        beta1_pow_acc, beta2_pow_acc, master_weight, found_inf,
        _beta1, _beta2, self._epsilon, lr_ratio_, self._coeff,
        with_decay, self._lazy_mode, 1000, find_master, False)
else:
    _, _, _, _, _, _ = _C_ops.adamw(
        param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
        beta1_pow_acc, beta2_pow_acc, master_weight,
        param_and_grad[0], moment1, moment2, beta1_pow_acc,
        beta2_pow_acc, master_weight, 'epsilon', self._epsilon,
        'lazy_mode', self._lazy_mode,
        'min_row_size_to_use_multithread', 1000, 'beta1', _beta1,
        'beta2', _beta2, "with_decay", with_decay, 'coeff',
        self._coeff, 'multi_precision', find_master, 'lr_ratio',
        lr_ratio_)
return None
inputs = {
...
...@@ -58,6 +58,12 @@
func : AdamaxInferMeta
kernel :
func : adamax
- api : adamw
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
optional : master_param, skip_update
invoke : adamw_impl(param, grad, learning_rate, moment1, moment2, beta1_pow, beta2_pow, master_param, skip_update, beta1, beta2, epsilon, lr_ratio, coeff, with_decay, lazy_mode, min_row_size_to_use_multithread, multi_precision, use_global_beta_pow)
- api : add
args : (Tensor x, Tensor y)
...
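The yaml entry above is what generates the `final_state_adamw` eager API used in the optimizer change: the order of `args` is the positional order of the call, `optional` marks inputs that may be passed as None, and `invoke` forwards everything to the hand-written `adamw_impl`. A sketch of calling the generated op directly with that argument order, purely illustrative and assuming eager (final-state) mode is active; in practice the op is reached through `paddle.optimizer.AdamW`:

```python
import paddle
from paddle import _C_ops

# Illustrative only: argument order follows the yaml `args` list.
param = paddle.rand([10])
grad = paddle.rand([10])
lr = paddle.to_tensor([0.001], dtype='float32')
moment1 = paddle.zeros([10])
moment2 = paddle.zeros([10])
beta1_pow = paddle.to_tensor([0.9], dtype='float32')
beta2_pow = paddle.to_tensor([0.999], dtype='float32')

outs = _C_ops.final_state_adamw(
    param, grad, lr, moment1, moment2, beta1_pow, beta2_pow,
    None, None,          # master_param, skip_update (the two optional inputs)
    0.9, 0.999, 1e-8,    # beta1, beta2, epsilon
    1.0, 0.01,           # lr_ratio, coeff
    True, False, 1000,   # with_decay, lazy_mode, min_row_size_to_use_multithread
    False, False)        # multi_precision, use_global_beta_pow
```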