未验证 提交 8cbf79a3 编写于 作者: C chentianyu03 提交者: GitHub

[Yaml]Add adam yaml (#41561)

* add adam yaml

* add adam final_state api

* add adam_impl
上级 a4d4c116
...@@ -33,6 +33,187 @@ limitations under the License. */ ...@@ -33,6 +33,187 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace experimental { namespace experimental {
// Hand-written forward implementation for the "adam" optimizer API.
// It resolves the phi kernel key from `param`, prepares all dense-tensor
// inputs (including the optional master_param / skip_update), runs
// AdamInferMeta to configure output metadata, and then invokes the raw
// variadic "adam" kernel. Returns the 6-tuple declared in the API yaml:
// (param_out, moment1_out, moment2_out, beta1_pow_out, beta2_pow_out,
//  master_param_outs).
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> adam_impl(
    const Tensor& param,
    const Tensor& grad,
    const Tensor& learning_rate,
    const Tensor& moment1,
    const Tensor& moment2,
    const Tensor& beta1_pow,
    const Tensor& beta2_pow,
    paddle::optional<const Tensor&> master_param,
    paddle::optional<const Tensor&> skip_update,
    const Scalar& beta1,
    const Scalar& beta2,
    const Scalar& epsilon,
    bool lazy_mode,
    int64_t min_row_size_to_use_multithread,
    bool multi_precision,
    bool use_global_beta_pow) {
  // Kernel key starts fully UNDEFINED, so the branch below always parses the
  // key (backend / layout / dtype) from `param` alone.
  Backend kernel_backend = Backend::UNDEFINED;
  DataLayout kernel_layout = DataLayout::UNDEFINED;
  DataType kernel_data_type = DataType::UNDEFINED;
  if (kernel_backend == Backend::UNDEFINED ||
      kernel_layout == DataLayout::UNDEFINED ||
      kernel_data_type == DataType::UNDEFINED) {
    auto kernel_key_set = ParseKernelKeyByInputArgs(param);
    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
    if (kernel_backend == Backend::UNDEFINED) {
      kernel_backend = kernel_key.backend();
    }
    if (kernel_layout == DataLayout::UNDEFINED) {
      kernel_layout = kernel_key.layout();
    }
    if (kernel_data_type == DataType::UNDEFINED) {
      kernel_data_type = kernel_key.dtype();
    }
  }
  // Look up the registered "adam" kernel for the resolved key; throws if no
  // matching kernel exists.
  std::string kernel_name = "adam";
  const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
      kernel_name, {kernel_backend, kernel_layout, kernel_data_type});
  VLOG(6) << kernel_name << " API kernel key: [" << kernel_backend << ", "
          << kernel_layout << ", " << kernel_data_type << "]";
  VLOG(6) << kernel_name << " API kernel: " << kernel;
  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
  // Prepare (and, if needed, transform) each required dense input so it
  // matches the kernel's expected argument definition at that position.
  auto input_param = PrepareData(param, kernel.InputAt(0), {});
  auto input_grad = PrepareData(grad, kernel.InputAt(1), {});
  auto input_lr = PrepareData(learning_rate, kernel.InputAt(2), {});
  auto input_moment1 = PrepareData(moment1, kernel.InputAt(3), {});
  auto input_moment2 = PrepareData(moment2, kernel.InputAt(4), {});
  auto input_beta1_pow = PrepareData(beta1_pow, kernel.InputAt(5), {});
  auto input_beta2_pow = PrepareData(beta2_pow, kernel.InputAt(6), {});
  // Optional inputs default to paddle::none and are bound below only when
  // the caller actually supplied them.
  paddle::optional<const phi::DenseTensor&> input_master_param(paddle::none);
  auto input_master_param_ptr =
      PrepareData(master_param, kernel.InputAt(7), {});
  paddle::optional<const phi::DenseTensor&> input_skip_update(paddle::none);
  auto input_skip_update_ptr = PrepareData(skip_update, kernel.InputAt(8), {});
  // NOTE(review): api_output is default-constructed and returned as-is at the
  // end; the kernel outputs below alias the *prepared input* tensors, so the
  // update is effectively in-place on param/moment1/moment2/beta-pow buffers.
  // Presumably callers rely on that in-place semantic — confirm.
  std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> api_output;
  auto kernel_out_0 = input_param.get();
  auto kernel_out_1 = input_moment1.get();
  auto kernel_out_2 = input_moment2.get();
  auto kernel_out_3 = input_beta1_pow.get();
  auto kernel_out_4 = input_beta2_pow.get();
  // master_param output aliases the prepared master_param input when present;
  // otherwise it stays nullptr.
  phi::DenseTensor* kernel_out_5 = nullptr;
  if (input_master_param_ptr) {
    input_master_param =
        paddle::make_optional<const phi::DenseTensor&>(*input_master_param_ptr);
    kernel_out_5 =
        paddle::make_optional<phi::DenseTensor&>(*input_master_param_ptr)
            .get_ptr();
  }
  if (input_skip_update_ptr) {
    input_skip_update =
        paddle::make_optional<const phi::DenseTensor&>(*input_skip_update_ptr);
  }
  // Build optional MetaTensor views of the optional inputs: InferMeta only
  // needs dtype / dims / layout, copied from the prepared tensors.
  paddle::optional<const phi::MetaTensor&> input_meta_ref_master_param(
      paddle::none);
  phi::DenseTensor dt;
  phi::MetaTensor input_meta_tmp_master_param(dt);
  if (input_master_param_ptr) {
    input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype());
    input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims());
    input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout());
    input_meta_ref_master_param = input_meta_tmp_master_param;
  }
  paddle::optional<const phi::MetaTensor&> input_meta_ref_skip_update(
      paddle::none);
  phi::DenseTensor dt1;
  phi::MetaTensor input_meta_tmp_skip_update(dt1);
  if (input_skip_update_ptr) {
    input_meta_tmp_skip_update.set_dtype(input_skip_update_ptr->dtype());
    input_meta_tmp_skip_update.set_dims(input_skip_update_ptr->dims());
    input_meta_tmp_skip_update.set_layout(input_skip_update_ptr->layout());
    input_meta_ref_skip_update = input_meta_tmp_skip_update;
  }
  // NOTE(review): kernel_out_5 may still be nullptr here (no master_param),
  // so meta_out_5 wraps a null DenseTensor* — assumed MetaTensor/InferMeta
  // tolerate that; confirm against phi::MetaTensor's null handling.
  phi::MetaTensor meta_out_0(kernel_out_0);
  phi::MetaTensor meta_out_1(kernel_out_1);
  phi::MetaTensor meta_out_2(kernel_out_2);
  phi::MetaTensor meta_out_3(kernel_out_3);
  phi::MetaTensor meta_out_4(kernel_out_4);
  phi::MetaTensor meta_out_5(kernel_out_5);
  // Propagate shapes/dtypes to all six outputs before the kernel launch.
  phi::AdamInferMeta(MakeMetaTensor(*input_param),
                     MakeMetaTensor(*input_grad),
                     MakeMetaTensor(*input_lr),
                     MakeMetaTensor(*input_moment1),
                     MakeMetaTensor(*input_moment2),
                     MakeMetaTensor(*input_beta1_pow),
                     MakeMetaTensor(*input_beta2_pow),
                     input_meta_ref_master_param,
                     input_meta_ref_skip_update,
                     beta1,
                     beta2,
                     epsilon,
                     lazy_mode,
                     min_row_size_to_use_multithread,
                     multi_precision,
                     use_global_beta_pow,
                     &meta_out_0,
                     &meta_out_1,
                     &meta_out_2,
                     &meta_out_3,
                     &meta_out_4,
                     &meta_out_5);
  // Raw function-pointer signature of the phi "adam" kernel: 7 dense inputs,
  // 2 optional inputs, 3 Scalar attrs, 4 plain attrs, 6 outputs. Must match
  // the kernel registration exactly.
  using kernel_signature = void (*)(const platform::DeviceContext&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    const phi::DenseTensor&,
                                    paddle::optional<const phi::DenseTensor&>,
                                    paddle::optional<const phi::DenseTensor&>,
                                    const Scalar&,
                                    const Scalar&,
                                    const Scalar&,
                                    bool,
                                    int64_t,
                                    bool,
                                    bool,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*,
                                    phi::DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  // Launch: writes results into the aliased input tensors (in-place update).
  (*kernel_fn)(*dev_ctx,
               *input_param,
               *input_grad,
               *input_lr,
               *input_moment1,
               *input_moment2,
               *input_beta1_pow,
               *input_beta2_pow,
               input_master_param,
               input_skip_update,
               beta1,
               beta2,
               epsilon,
               lazy_mode,
               min_row_size_to_use_multithread,
               multi_precision,
               use_global_beta_pow,
               kernel_out_0,
               kernel_out_1,
               kernel_out_2,
               kernel_out_3,
               kernel_out_4,
               kernel_out_5);
  return api_output;
}
////////////////// Forward api impls ////////////////////// ////////////////// Forward api impls //////////////////////
Tensor conv2d_impl(const Tensor& input, Tensor conv2d_impl(const Tensor& input,
......
...@@ -30,6 +30,24 @@ namespace experimental { ...@@ -30,6 +30,24 @@ namespace experimental {
////////////////// Forward api impls ////////////////////// ////////////////// Forward api impls //////////////////////
// Hand-written forward API for the adam optimizer update.
// Outputs (per the api yaml): param_out, moment1_out, moment2_out,
// beta1_pow_out, beta2_pow_out, master_param_outs.
// master_param and skip_update are optional inputs (paddle::none when absent).
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> adam_impl(
    const Tensor& param,
    const Tensor& grad,
    const Tensor& learning_rate,
    const Tensor& moment1,
    const Tensor& moment2,
    const Tensor& beta1_pow,
    const Tensor& beta2_pow,
    paddle::optional<const Tensor&> master_param,
    paddle::optional<const Tensor&> skip_update,
    const Scalar& beta1,
    const Scalar& beta2,
    const Scalar& epsilon,
    bool lazy_mode,
    int64_t min_row_size_to_use_multithread,
    bool multi_precision,
    bool use_global_beta_pow);
std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_impl( std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_impl(
const Tensor& x, const Tensor& x,
const Tensor& scale, const Tensor& scale,
......
...@@ -21,6 +21,7 @@ from paddle.fluid import core ...@@ -21,6 +21,7 @@ from paddle.fluid import core
from paddle.fluid.op import Operator from paddle.fluid.op import Operator
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle import paddle
from paddle.fluid.framework import _test_eager_guard
class TestAdamOp1(OpTest): class TestAdamOp1(OpTest):
...@@ -189,6 +190,10 @@ class TestAdamOpMultipleSteps(OpTest): ...@@ -189,6 +190,10 @@ class TestAdamOpMultipleSteps(OpTest):
self.inputs['Grad'] = np.random.uniform( self.inputs['Grad'] = np.random.uniform(
-1, 1, (102, 105)).astype("float32") -1, 1, (102, 105)).astype("float32")
def test_api_eager_dygraph(self):
    # Re-run the existing output check under eager mode so the
    # final-state adam path added in this change is also exercised.
    with _test_eager_guard():
        self.test_check_output()
def adam_step(inputs, attributes): def adam_step(inputs, attributes):
''' '''
...@@ -732,6 +737,14 @@ class TestAdamOpV2(unittest.TestCase): ...@@ -732,6 +737,14 @@ class TestAdamOpV2(unittest.TestCase):
adam.step() adam.step()
paddle.enable_static() paddle.enable_static()
def test_api_eager_dygraph(self):
    # Replay the dygraph Adam test cases inside the eager guard so the
    # eager-mode (final-state) adam kernel path gets the same coverage.
    with _test_eager_guard():
        self.test_adam_op_dygraph()
        self.test_adam_op_with_state_dict()
        self.test_adam_with_grad_clip()
        self.test_adam_op_with_set_lr()
        self.test_adam_op_with_sparse_input_and_weight_decay()
class TestAdamOptimizer(unittest.TestCase): class TestAdamOptimizer(unittest.TestCase):
def _test(self, def _test(self,
......
...@@ -24,6 +24,7 @@ import paddle.compat as cpt ...@@ -24,6 +24,7 @@ import paddle.compat as cpt
import numpy as np import numpy as np
from paddle.fluid.backward import append_backward from paddle.fluid.backward import append_backward
from paddle.fluid.framework import Program, program_guard, convert_np_dtype_to_dtype_ from paddle.fluid.framework import Program, program_guard, convert_np_dtype_to_dtype_
from paddle.fluid.framework import _test_eager_guard
import paddle import paddle
from paddle.io import Dataset from paddle.io import Dataset
import numpy import numpy
...@@ -1114,6 +1115,11 @@ class TestOptimizerDtype(unittest.TestCase): ...@@ -1114,6 +1115,11 @@ class TestOptimizerDtype(unittest.TestCase):
def test_float32(self): def test_float32(self):
self.check_with_dtype('float32') self.check_with_dtype('float32')
def test_api_eager_dygraph(self):
    # Repeat the dtype checks under eager mode to cover the eager
    # optimizer path as well.
    with _test_eager_guard():
        self.test_float64()
        self.test_float32()
class TestMasterWeightSaveForFP16(unittest.TestCase): class TestMasterWeightSaveForFP16(unittest.TestCase):
''' '''
......
...@@ -336,7 +336,23 @@ class Adam(Optimizer): ...@@ -336,7 +336,23 @@ class Adam(Optimizer):
lr = self._create_param_lr(param_and_grad) lr = self._create_param_lr(param_and_grad)
# create the adam optimize op # create the adam optimize op
if framework._non_static_mode(): if framework.in_dygraph_mode():
found_inf = self._get_auxiliary_var('found_inf')
_beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0)
_beta2 = self._beta2 if not isinstance(
self._beta2, Variable) else self._beta2.numpy().item(0)
_, _, _, _, _, _ = _C_ops.final_state_adam(
param_and_grad[0], param_and_grad[1], lr, moment1, moment2,
beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1,
_beta2, self._epsilon, self._lazy_mode, 1000, find_master,
False)
return None
if framework._in_legacy_dygraph():
_beta1 = self._beta1 if not isinstance( _beta1 = self._beta1 if not isinstance(
self._beta1, Variable) else self._beta1.numpy().item(0) self._beta1, Variable) else self._beta1.numpy().item(0)
......
...@@ -45,6 +45,12 @@ ...@@ -45,6 +45,12 @@
kernel : kernel :
func : adadelta func : adadelta
# adam has two optional inputs (master_param, skip_update), so instead of a
# generated kernel call it dispatches to the hand-written adam_impl.
- api : adam
  args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow)
  output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs)
  optional : master_param, skip_update
  invoke : adam_impl(param, grad, learning_rate, moment1, moment2, beta1_pow, beta2_pow, master_param, skip_update, beta1, beta2, epsilon, lazy_mode, min_row_size_to_use_multithread, multi_precision, use_global_beta_pow)
- api : adamax - api : adamax
args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon) args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon)
output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out) output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册