From fac7fd42e940a029660d99d145ec23a610cf7772 Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Tue, 5 Apr 2022 21:57:59 +0800
Subject: [PATCH] [Phi]Add mean/momentum yaml (#41319)

* move yaml

* add momentum yaml

* delete code

* delete some code

* add meshgrid backward

* delete code

* fix compile bugs
---
 paddle/phi/api/lib/api_custom_impl.cc         | 143 ++++++++++++++++++
 paddle/phi/api/lib/api_custom_impl.h          |  14 ++
 paddle/phi/infermeta/multiary.cc              |  47 ++++++
 paddle/phi/infermeta/multiary.h               |  15 ++
 python/paddle/fluid/layers/nn.py              |   4 +-
 .../fluid/tests/unittests/test_mean_op.py     |  10 +-
 .../fluid/tests/unittests/test_momentum_op.py |  10 ++
 python/paddle/optimizer/momentum.py           |  12 +-
 python/paddle/utils/code_gen/api.yaml         |  15 ++
 python/paddle/utils/code_gen/backward.yaml    |  10 ++
 10 files changed, 272 insertions(+), 8 deletions(-)

diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index b816204c1a3..46d09c29bc0 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -123,6 +123,149 @@ std::vector<Tensor> split_impl(const Tensor& x,
   return out;
 }
 
+std::tuple<Tensor, Tensor, Tensor> momentum_impl(
+    const Tensor& param,
+    const Tensor& grad,
+    const Tensor& velocity,
+    const Tensor& learning_rate,
+    paddle::optional<const Tensor&> master_param,
+    float mu,
+    bool use_nesterov,
+    const std::string& regularization_method,
+    float regularization_coeff,
+    bool multi_precision,
+    float rescale_grad) {
+  Backend kernel_backend = Backend::UNDEFINED;
+  DataLayout kernel_layout = DataLayout::UNDEFINED;
+  DataType kernel_data_type = DataType::UNDEFINED;
+  if (kernel_backend == Backend::UNDEFINED ||
+      kernel_layout == DataLayout::UNDEFINED ||
+      kernel_data_type == DataType::UNDEFINED) {
+    auto kernel_key_set = ParseKernelKeyByInputArgs(param);
+    auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+    if (kernel_backend == Backend::UNDEFINED) {
+      kernel_backend = kernel_key.backend();
+    }
+    if (kernel_layout == DataLayout::UNDEFINED) {
+      kernel_layout = kernel_key.layout();
+    }
+    if (kernel_data_type == DataType::UNDEFINED) {
+      kernel_data_type = kernel_key.dtype();
+    }
+  }
+  std::string kernel_name = "momentum";
+  if (grad.is_selected_rows()) {
+    kernel_name = "momentum_dense_param_sparse_grad";
+  }
+  const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_name, {kernel_backend, kernel_layout, kernel_data_type});
+  VLOG(6) << kernel_name << " API kernel key: [" << kernel_backend << ", "
+          << kernel_layout << ", " << kernel_data_type << "]";
+  VLOG(6) << kernel_name << " API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+
+  auto input_param = PrepareData(param, kernel.InputAt(0), {});
+  auto input_grad = PrepareData(grad, kernel.InputAt(1), {});
+  auto input_velocity = PrepareData(velocity, kernel.InputAt(2), {});
+  auto input_learning_rate = PrepareData(learning_rate, kernel.InputAt(3), {});
+  paddle::optional<const phi::DenseTensor&> input_master_param(paddle::none);
+  auto input_master_param_ptr =
+      PrepareData(master_param, kernel.InputAt(4), {});
+
+  std::tuple<Tensor, Tensor, Tensor> api_output;
+  auto kernel_out_0 = input_param.get();
+  auto kernel_out_1 = input_velocity.get();
+  phi::DenseTensor* kernel_out_2 = nullptr;
+  if (input_master_param_ptr) {
+    input_master_param =
+        paddle::make_optional<const phi::DenseTensor&>(*input_master_param_ptr);
+    kernel_out_2 =
+        paddle::make_optional<phi::DenseTensor&>(*input_master_param_ptr)
+            .get_ptr();
+  }
+
+  paddle::optional<const phi::MetaTensor&> input_meta_ref_master_param(
+      paddle::none);
+  phi::DenseTensor dt;
+  phi::MetaTensor input_meta_tmp_master_param(dt);
+  if (input_master_param_ptr) {
+    input_meta_tmp_master_param.set_dtype(input_master_param_ptr->dtype());
+    input_meta_tmp_master_param.set_dims(input_master_param_ptr->dims());
+    input_meta_tmp_master_param.set_layout(input_master_param_ptr->layout());
+    input_meta_ref_master_param = input_meta_tmp_master_param;
+  }
+  phi::MetaTensor meta_out_0(kernel_out_0);
+  phi::MetaTensor meta_out_1(kernel_out_1);
+  if (kernel_out_2) {
+    phi::MetaTensor meta_out_2(kernel_out_2);
+    phi::MomentumInferMeta(MakeMetaTensor(*input_param),
+                           MakeMetaTensor(*input_grad),
+                           MakeMetaTensor(*input_velocity),
+                           MakeMetaTensor(*input_learning_rate),
+                           input_meta_ref_master_param,
+                           mu,
+                           use_nesterov,
+                           regularization_method,
+                           regularization_coeff,
+                           multi_precision,
+                           rescale_grad,
+                           &meta_out_0,
+                           &meta_out_1,
+                           &meta_out_2);
+  } else {
+    phi::MomentumInferMeta(MakeMetaTensor(*input_param),
+                           MakeMetaTensor(*input_grad),
+                           MakeMetaTensor(*input_velocity),
+                           MakeMetaTensor(*input_learning_rate),
+                           input_meta_ref_master_param,
+                           mu,
+                           use_nesterov,
+                           regularization_method,
+                           regularization_coeff,
+                           multi_precision,
+                           rescale_grad,
+                           &meta_out_0,
+                           &meta_out_1,
+                           nullptr);
+  }
+
+  using kernel_signature = void (*)(const platform::DeviceContext&,
+                                    const phi::DenseTensor&,
+                                    const phi::DenseTensor&,
+                                    const phi::DenseTensor&,
+                                    const phi::DenseTensor&,
+                                    paddle::optional<const phi::DenseTensor&>,
+                                    float,
+                                    bool,
+                                    const std::string&,
+                                    float,
+                                    bool,
+                                    float,
+                                    phi::DenseTensor*,
+                                    phi::DenseTensor*,
+                                    phi::DenseTensor*);
+  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+
+  (*kernel_fn)(*dev_ctx,
+               *input_param,
+               *input_grad,
+               *input_velocity,
+               *input_learning_rate,
+               input_master_param,
+               mu,
+               use_nesterov,
+               regularization_method,
+               regularization_coeff,
+               multi_precision,
+               rescale_grad,
+               kernel_out_0,
+               kernel_out_1,
+               kernel_out_2);
+
+  return api_output;
+}
+
 ////////////////// Backward(grad) api impls //////////////////////
 
 // TODO(chenweihang): the original sum grad op can support higher-level
diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h
index 430eccdf430..15b593238cc 100644
--- a/paddle/phi/api/lib/api_custom_impl.h
+++ b/paddle/phi/api/lib/api_custom_impl.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/phi/common/int_array.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/common/scalar.h"
+#include "paddle/utils/optional.h"
 
 namespace paddle {
 namespace experimental {
@@ -33,6 +34,19 @@ std::vector<Tensor> split_impl(const Tensor& x,
                                const IntArray& num_or_sections,
                                const Scalar& axis);
 
+std::tuple<Tensor, Tensor, Tensor> momentum_impl(
+    const Tensor& param,
+    const Tensor& grad,
+    const Tensor& velocity,
+    const Tensor& learning_rate,
+    paddle::optional<const Tensor&> master_param,
+    float mu,
+    bool use_nesterov,
+    const std::string& regularization_method,
+    float regularization_coeff,
+    bool multi_precision,
+    float rescale_grad);
+
 ////////////////// Backward(grad) api impls //////////////////////
 
 std::vector<Tensor> add_n_grad_impl(const std::vector<Tensor>& x,
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index 76951669c66..f2acfe5a996 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -1504,6 +1504,53 @@ void MeshgridInferMeta(const std::vector<MetaTensor*>& inputs,
   }
 }
 
+void MomentumInferMeta(const MetaTensor& param,
+                       const MetaTensor& grad,
+                       const MetaTensor& velocity,
+                       const MetaTensor& learning_rate,
+                       paddle::optional<const MetaTensor&> master_param,
+                       float mu,
+                       bool use_nesterov,
+                       const std::string& regularization_method,
+                       float regularization_coeff,
+                       bool multi_precision,
+                       float rescale_grad,
+                       MetaTensor* param_out,
+                       MetaTensor* velocity_out,
+                       MetaTensor* master_param_out) {
+  PADDLE_ENFORCE_NE(
+      param_out,
+      nullptr,
+      errors::NotFound("Output(ParamOut) of Momentum should not be null."));
+  PADDLE_ENFORCE_NE(
+      velocity_out,
+      nullptr,
+      errors::NotFound("Output(VelocityOut) of Momentum should not be null."));
+
+  auto lr_dims = learning_rate.dims();
+  PADDLE_ENFORCE_NE(
+      phi::product(lr_dims),
+      0,
+      errors::InvalidArgument("Maybe the Input variable LearningRate has not "
+                              "been initialized. You may need to confirm "
+                              "if you put exe.run(startup_program) "
+                              "after optimizer.minimize function."));
+  PADDLE_ENFORCE_EQ(
+      phi::product(lr_dims),
+      1,
+      errors::InvalidArgument("Learning_rate should be a scalar. But Received "
But Received " + "LearningRate's dim [%s]", + phi::product(lr_dims))); + + auto param_dim = param.dims(); + param_out->set_dims(param_dim); + velocity_out->set_dims(param_dim); + + if (master_param_out) { + master_param_out->set_dims(param_dim); + } +} + void MultiDotInferMeta(const std::vector& x, MetaTensor* out) { auto inputs_dims = GetMetaTensorsDim(x); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index c63960c7b9b..c037641d082 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -230,6 +230,21 @@ void InterpolateInferMeta( void MeshgridInferMeta(const std::vector& inputs, std::vector outputs); +void MomentumInferMeta(const MetaTensor& param, + const MetaTensor& grad, + const MetaTensor& velocity, + const MetaTensor& learning_rate, + paddle::optional master_param, + float mu, + bool use_nesterov, + const std::string& regularization_method, + float regularization_coeff, + bool multi_precision, + float rescale_grad, + MetaTensor* param_out, + MetaTensor* velocity_out, + MetaTensor* master_param_out); + void MultiDotInferMeta(const std::vector& x, MetaTensor* out); void MultiplexInferMeta(const std::vector& ins, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c489b362ccf..7dc0d0af68c 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -12806,8 +12806,10 @@ def mean(x, name=None): mean = fluid.layers.mean(input) """ - if _non_static_mode(): + if _in_legacy_dygraph(): return _C_ops.mean(x) + if in_dygraph_mode(): + return _C_ops.final_state_mean_all(x) helper = LayerHelper("mean", **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mean') diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index b20c2932f09..c5ee5c91e1c 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import Program, program_guard - +from paddle.fluid.framework import _test_eager_guard np.random.seed(10) @@ -40,7 +40,7 @@ def reduce_mean_wrapper(x, axis=0, keepdim=False, reduce_all=False): class TestMeanOp(OpTest): def setUp(self): self.op_type = "mean" - self.python_api = mean_wrapper + self.python_api = fluid.layers.mean self.dtype = np.float64 self.init_dtype_type() self.inputs = {'X': np.random.random((10, 10)).astype(self.dtype)} @@ -81,7 +81,7 @@ class TestFP16MeanOp(TestMeanOp): def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place) + self.check_output_with_place(place, check_eager=True) def test_checkout_grad(self): place = core.CUDAPlace(0) @@ -104,11 +104,11 @@ class TestBF16MeanOp(TestMeanOp): def test_check_output(self): paddle.enable_static() - self.check_output_with_place(core.CPUPlace()) + self.check_output_with_place(core.CPUPlace(), check_eager=True) def test_checkout_grad(self): place = core.CPUPlace() - self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place(place, ['X'], 'Out', check_eager=True) def ref_reduce_mean(x, axis=None, keepdim=False, reduce_all=False): diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 7f3690cff60..a4f38e37731 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ 
@@ -22,6 +22,7 @@ from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 import numpy
+from paddle.fluid.framework import _test_eager_guard
 
 
 def calculate_momentum_by_numpy(param,
@@ -528,6 +529,11 @@ class TestMomentumV2(unittest.TestCase):
             ValueError, paddle.optimizer.Momentum, learning_rate=None)
         self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None)
 
+    def test_api_eager_dygraph(self):
+        with _test_eager_guard():
+            self.test_momentum_dygraph()
+            self.test_raise_error()
+
 
 class TestMomentumOpWithDecay(OpTest):
     def setUp(self):
@@ -921,6 +927,10 @@ class TestMultiTensorMomentumDygraph(unittest.TestCase):
             self._check_with_param_arrt(place, use_amp)
             self._check_with_param_group(place, use_amp)
 
+    def test_api_eager_dygraph(self):
+        with _test_eager_guard():
+            self.test_main()
+
 
 class TestMultiTensorMomentumStatic(unittest.TestCase):
     def _momentum_optimize_static(self,
diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py
index f68bbad4ab2..ce112c19250 100644
--- a/python/paddle/optimizer/momentum.py
+++ b/python/paddle/optimizer/momentum.py
@@ -25,6 +25,7 @@ import paddle.fluid as fluid
 from paddle.fluid.regularizer import L2DecayRegularizer
 from paddle import _C_ops
 import paddle
+from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph
 
 __all__ = []
 
@@ -313,7 +314,7 @@ class Momentum(Optimizer):
         master_weight = (self._master_weights[param_and_grad[0].name]
                          if find_master else None)
 
-        if framework._non_static_mode():
+        if _in_legacy_dygraph():
             if isinstance(param_and_grad, dict):
                 self._update_regularization(param_and_grad['weight_decay'])
             _, _, _ = _C_ops.momentum(
@@ -323,8 +324,15 @@ class Momentum(Optimizer):
                 'regularization_method', regularization_method,
                 'regularization_coeff', regularization_coeff, 'multi_precision',
                 find_master)
-
             return None
+        if in_dygraph_mode():
+            if isinstance(param_and_grad, dict):
+                self._update_regularization(param_and_grad['weight_decay'])
+            return _C_ops.final_state_momentum(
+                param_and_grad[0], param_and_grad[1], velocity_acc, lr,
+                master_weight, self._momentum, self._use_nesterov,
+                regularization_method, regularization_coeff, find_master,
+                self._rescale_grad)
 
         attrs = {
             "mu": self._momentum,
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index a0c484f6562..13afa3fc868 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -1120,6 +1120,15 @@
     func : mean
   backward : mean_grad
 
+- api : mean_all
+  args : (Tensor x)
+  output : Tensor
+  infer_meta :
+    func : MeanAllInferMeta
+  kernel :
+    func : mean_all
+  backward : mean_all_grad
+
 - api : meshgrid
   args : (Tensor[] inputs)
   output : Tensor[]
@@ -1172,6 +1181,12 @@
     func : modulo
   backward : modulo_grad
 
+- api : momentum
+  args : (Tensor param, Tensor grad, Tensor velocity, Tensor learning_rate, Tensor master_param, float mu, bool use_nesterov = false, str regularization_method = "", float regularization_coeff = 0.0, bool multi_precision = false, float rescale_grad = 1.0f)
+  output : Tensor(param_out), Tensor(velocity_out), Tensor(master_param_out)
+  invoke : momentum_impl(param, grad, velocity, learning_rate, master_param, mu, use_nesterov, regularization_method, regularization_coeff, multi_precision, rescale_grad)
+  optional : master_param
+
 # multinomial
 - api : multinomial
   args : (Tensor x, int num_samples, bool replacement)
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index e268675bdcf..632636dea6d 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -779,6 +779,16 @@
   kernel :
     func : maximum_grad
 
+- backward_api : mean_all_grad
+  forward : mean_all(Tensor x) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param: [x]
+  kernel :
+    func : mean_all_grad
+
 - backward_api : mean_grad
   forward: mean (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out)
   args : (Tensor x, Tensor out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false)
-- 
GitLab
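
For context, the eager-mode path added here is what the new `test_api_eager_dygraph` cases exercise: under `_test_eager_guard`, `Momentum._append_optimize_op` takes the `in_dygraph_mode()` branch and calls `_C_ops.final_state_momentum`, which the `momentum` yaml entry routes through `momentum_impl` to the phi kernel. A minimal sketch of that flow (the layer, shapes, and hyperparameters below are illustrative, not part of this patch):

```python
import paddle
from paddle.fluid.framework import _test_eager_guard

# Illustrative smoke test: inside the eager guard, opt.step() dispatches to
# _C_ops.final_state_momentum instead of the legacy _C_ops.momentum path.
with _test_eager_guard():
    linear = paddle.nn.Linear(10, 10)
    out = linear(paddle.rand([4, 10]))
    loss = paddle.mean(out)
    opt = paddle.optimizer.Momentum(
        learning_rate=0.1, momentum=0.9, parameters=linear.parameters())
    loss.backward()
    opt.step()
    opt.clear_grad()
```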
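As a reference for what the `momentum` kernel behind `momentum_impl` computes, here is a rough numpy sketch of the dense update rule, in the spirit of `calculate_momentum_by_numpy` in the tests. The folding of `rescale_grad` and `l2_decay` regularization into the gradient is my paraphrase of the standard Paddle momentum semantics, not code taken from this patch:

```python
import numpy as np

def momentum_update(param, grad, velocity, lr, mu,
                    use_nesterov=False,
                    regularization_method="",
                    regularization_coeff=0.0,
                    rescale_grad=1.0):
    # Optional gradient rescale and L2 decay are applied to the gradient
    # before the velocity update.
    grad = grad * rescale_grad
    if regularization_method == "l2_decay":
        grad = grad + regularization_coeff * param
    velocity_out = mu * velocity + grad
    if use_nesterov:
        # Nesterov look-ahead step.
        param_out = param - lr * (grad + mu * velocity_out)
    else:
        param_out = param - lr * velocity_out
    return param_out, velocity_out
```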