diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index f51d776d7195c5fcb8b013dedffb3d7f730a7c6e..a2af131cb505e4d0cb499f304382949c92c8ad99 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -126,7 +126,7 @@ $$param\_out = param - learning\_rate * grad$$
 
 namespace ops = paddle::operators;
 DECLARE_INFER_SHAPE_FUNCTOR(sgd, SGDInferShapeFunctor,
-                            PD_INFER_META(phi::SGDInferMeta));
+                            PD_INFER_META(phi::SgdInferMeta));
 REGISTER_OPERATOR(
     sgd, ops::SGDOp, ops::SGDOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index d7f148fff818b3c9ddf49ad90821ea812d8705c5..2b80094a39e31646139e7a5312f933cf9a55cf6a 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -656,6 +656,176 @@ std::tuple<Tensor, Tensor, Tensor> momentum_impl(
   return api_output;
 }
 
+std::tuple<Tensor, Tensor> sgd_impl(
+    const Tensor& param,
+    const Tensor& learning_rate,
+    const Tensor& grad,
+    paddle::optional<const Tensor&> master_param,
+    bool multi_precision) {
+  DataType kernel_data_type = ParseDataType(param);
+  auto kernel_key_set = ParseKernelKeyByInputArgs(param, learning_rate, grad);
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+  VLOG(6) << "sgd API kernel key: [" << kernel_key.backend() << ", "
+          << kernel_key.layout() << ", " << kernel_data_type << "]";
+
+  const auto& param_tensor = param.impl();
+  std::string kernel_name = "sgd";
+  if (phi::DenseTensor::classof(param_tensor.get())) {
+    if (!phi::DenseTensor::classof(grad.impl().get())) {
+      kernel_name = "sgd_dense_param_sparse_grad";
+    }
+  } else {
+    kernel_name = "sgd_sparse_param_sparse_grad";
+  }
+  const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_name,
+      {kernel_key.backend(), kernel_key.layout(), kernel_data_type});
+  VLOG(6) << kernel_name << " API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+
+  auto in_learning_rate =
+      PrepareData(learning_rate, kernel.InputAt(1), {false, true, true, true});
+
+  std::tuple<Tensor, Tensor> out;
+  std::get<0>(out) = param;
+  if (master_param) {
+    std::get<1>(out) = *master_param;
+  }
+  phi::MetaTensor meta_out_0(std::get<0>(out).impl().get());
+  phi::MetaTensor meta_out_1(master_param ? std::get<1>(out).impl().get()
+                                          : nullptr);
+
+  if (phi::DenseTensor::classof(param_tensor.get())) {
+    auto in_param = PrepareData(param, kernel.InputAt(0), {});
+    auto in_master_param = PrepareData(master_param, kernel.InputAt(3), {});
+
+    paddle::optional<const phi::DenseTensor&> in_master_param_opt =
+        master_param
+            ? paddle::make_optional<const phi::DenseTensor&>(*in_master_param)
+            : paddle::none;
+    auto master_param_meta = MakeMetaTensor(in_master_param_opt);
+    paddle::optional<const phi::MetaTensor&> master_param_meta_opt =
+        master_param
+            ? paddle::make_optional<const phi::MetaTensor&>(*master_param_meta)
+            : paddle::none;
+
+    phi::DenseTensor* kernel_out_0 =
+        SetKernelOutput(kernel_key.backend(), &std::get<0>(out));
+    phi::DenseTensor* kernel_out_1 =
+        master_param
+            ? static_cast<phi::DenseTensor*>(std::get<1>(out).impl().get())
+            : nullptr;
+
+    if (phi::DenseTensor::classof(grad.impl().get())) {
+      auto in_grad = PrepareData(grad, kernel.InputAt(2), {});
+      SgdInferMeta(MakeMetaTensor(*in_param),
+                   MakeMetaTensor(*in_learning_rate),
+                   MakeMetaTensor(*in_grad),
+                   master_param_meta_opt,
+                   multi_precision,
+                   &meta_out_0,
+                   &meta_out_1);
+
+      using kernel_signature =
+          void (*)(const platform::DeviceContext&,
+                   const phi::DenseTensor&,
+                   const phi::DenseTensor&,
+                   const phi::DenseTensor&,
+                   paddle::optional<const phi::DenseTensor&>,
+                   bool,
+                   phi::DenseTensor*,
+                   phi::DenseTensor*);
+
+      auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+      (*kernel_fn)(*dev_ctx,
+                   *in_param,
+                   *in_learning_rate,
+                   *in_grad,
+                   in_master_param_opt,
+                   multi_precision,
+                   kernel_out_0,
+                   kernel_out_1);
+    } else {
+      auto in_grad = TensorToSelectedRows(grad);
+      SgdInferMeta(MakeMetaTensor(*in_param),
+                   MakeMetaTensor(*in_learning_rate),
+                   MakeMetaTensor(*in_grad),
+                   master_param_meta_opt,
+                   multi_precision,
+                   &meta_out_0,
+                   &meta_out_1);
+
+      using kernel_signature =
+          void (*)(const platform::DeviceContext&,
+                   const phi::DenseTensor&,
+                   const phi::DenseTensor&,
+                   const phi::SelectedRows&,
+                   paddle::optional<const phi::DenseTensor&>,
+                   bool,
+                   phi::DenseTensor*,
+                   phi::DenseTensor*);
+      auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+      (*kernel_fn)(*dev_ctx,
+                   *in_param,
+                   *in_learning_rate,
+                   *in_grad,
+                   in_master_param_opt,
+                   multi_precision,
+                   kernel_out_0,
+                   kernel_out_1);
+    }
+  } else {
+    auto in_param = TensorToSelectedRows(param);
+    auto in_grad = TensorToSelectedRows(grad);
+    auto in_master_param = TensorToSelectedRows(master_param);
+    auto in_master_param_opt =
+        master_param
+            ? paddle::make_optional<const phi::SelectedRows&>(*in_master_param)
+            : paddle::none;
+    auto master_param_meta = MakeMetaTensor(in_master_param_opt);
+    paddle::optional<const phi::MetaTensor&> master_param_meta_opt =
+        master_param
+            ? paddle::make_optional<const phi::MetaTensor&>(*master_param_meta)
+            : paddle::none;
+
+    phi::SelectedRows* kernel_out_0 =
+        SetSelectedRowsKernelOutput(kernel_key.backend(), &std::get<0>(out));
+    phi::SelectedRows* kernel_out_1 =
+        master_param
+            ? static_cast<phi::SelectedRows*>(std::get<1>(out).impl().get())
+            : nullptr;
+
+    SgdInferMeta(MakeMetaTensor(*in_param),
+                 MakeMetaTensor(*in_learning_rate),
+                 MakeMetaTensor(*in_grad),
+                 master_param_meta_opt,
+                 multi_precision,
+                 &meta_out_0,
+                 &meta_out_1);
+
+    using kernel_signature =
+        void (*)(const platform::DeviceContext&,
+                 const phi::SelectedRows&,
+                 const phi::DenseTensor&,
+                 const phi::SelectedRows&,
+                 paddle::optional<const phi::SelectedRows&>,
+                 bool,
+                 phi::SelectedRows*,
+                 phi::SelectedRows*);
+    auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+    (*kernel_fn)(*dev_ctx,
+                 *in_param,
+                 *in_learning_rate,
+                 *in_grad,
+                 in_master_param_opt,
+                 multi_precision,
+                 kernel_out_0,
+                 kernel_out_1);
+  }
+  return out;
+}
+
 ////////////////// Backward(grad) api impls //////////////////////
 
 // TODO(chenweihang): the original sum grad op can support higher-level
diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h
index 5d46ed691816b71418ac62ec1726d590f342c0de..4ddc3e5f4e0d2edda8864960b79dc8eb22de48ff 100644
--- a/paddle/phi/api/lib/api_custom_impl.h
+++ b/paddle/phi/api/lib/api_custom_impl.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <tuple>
 #include <vector>
 
 #include "paddle/phi/api/include/tensor.h"
@@ -107,6 +108,13 @@ std::tuple<Tensor, Tensor, Tensor> momentum_impl(
     bool multi_precision,
     float rescale_grad);
 
+std::tuple<Tensor, Tensor> sgd_impl(
+    const Tensor& param,
+    const Tensor& learning_rate,
+    const Tensor& grad,
+    paddle::optional<const Tensor&> master_param,
+    bool multi_precision);
+
 ////////////////// Backward(grad) api impls //////////////////////
 
 std::vector<Tensor> add_n_grad_impl(const std::vector<Tensor>& x,
diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc
index f9db152956923a66c1e88af4841d59e834bcb726..e0c910ba3d66c9c8e61265d12e60da4dd252d035 100644
--- a/paddle/phi/api/lib/api_gen_utils.cc
+++ b/paddle/phi/api/lib/api_gen_utils.cc
@@ -20,13 +20,13 @@ namespace experimental {
 /* ------------------ for input ----------------------- */
 
 std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
-  return std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+  return std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
 }
 
 std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
-    const paddle::optional<Tensor>& tensor) {
+    const paddle::optional<const Tensor&>& tensor) {
   if (tensor) {
-    return std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl());
+    return std::static_pointer_cast<phi::DenseTensor>(tensor->impl());
   }
   return nullptr;
 }
@@ -45,13 +45,13 @@ std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
 }
 
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor) {
-  return std::dynamic_pointer_cast<phi::SelectedRows>(tensor.impl());
+  return std::static_pointer_cast<phi::SelectedRows>(tensor.impl());
 }
 
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
-    const paddle::optional<Tensor>& tensor) {
+    const paddle::optional<const Tensor&>& tensor) {
   if (tensor) {
-    return std::dynamic_pointer_cast<phi::SelectedRows>(tensor->impl());
+    return std::static_pointer_cast<phi::SelectedRows>(tensor->impl());
   }
   return nullptr;
 }
@@ -66,6 +66,14 @@ phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
   return phi::MetaTensor(tensor);
 }
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::DenseTensor&>& tensor) {
+  if (tensor) {
+    return {phi::MetaTensor(*tensor)};
+  }
+  return {paddle::none};
+}
+
 std::vector<phi::MetaTensor> MakeMetaTensor(
     const std::vector<const phi::DenseTensor*>& tensors) {
   std::vector<phi::MetaTensor> meta_tensors;
@@ -90,6 +98,14 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
   return phi::MetaTensor(tensor);
 }
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::SelectedRows&>& tensor) {
+  if (tensor) {
+    return {phi::MetaTensor(*tensor)};
+  }
+  return {paddle::none};
+}
+
 phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) {
   return phi::MetaTensor(tensor);
 }
diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h
index 035dfc5204720714346a260fc60db1362e542a85..47b80bb3fc290dbba2abade53a1866a557c174a6 100644
--- a/paddle/phi/api/lib/api_gen_utils.h
+++ b/paddle/phi/api/lib/api_gen_utils.h
@@ -42,7 +42,7 @@ std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
 
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
-    const paddle::optional<Tensor>& tensor);
+    const paddle::optional<const Tensor&>& tensor);
 
 std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor);
 
@@ -50,6 +50,9 @@ std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor);
 
 phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::DenseTensor&>& tensor);
+
 std::vector<phi::MetaTensor> MakeMetaTensor(
     const std::vector<const phi::DenseTensor*>& tensors);
 
@@ -58,6 +61,9 @@ std::vector<phi::MetaTensor> MakeMetaTensor(
 
 phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::SelectedRows&>& tensor);
+
 phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor);
 
 /* ------------------ for output ----------------------- */
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index f2acfe5a9962be97fd385f322e5136986ad78a28..5fecd3740e9307182479e51b77556902dc2354ed 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -1887,7 +1887,7 @@ void RnnInferMeta(const MetaTensor& x,
   }
 }
 
-void SGDInferMeta(const MetaTensor& param,
+void SgdInferMeta(const MetaTensor& param,
                   const MetaTensor& learning_rate,
                   const MetaTensor& grad,
                   paddle::optional<const MetaTensor&> master_param,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index c037641d082b759760035a6d33ccf2ecece1193e..9137b574ac09d19d3eed0f92d82509bf600795fa 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -292,7 +292,7 @@ void RnnInferMeta(const MetaTensor& x,
                   std::vector<MetaTensor*> state,
                   MetaTensor* reserve);
 
-void SGDInferMeta(const MetaTensor& param,
+void SgdInferMeta(const MetaTensor& param,
                   const MetaTensor& learning_rate,
                   const MetaTensor& grad,
                   paddle::optional<const MetaTensor&> master_param,
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 8242d8e3392ec02ebb0f335b099d24eebd9fff06..95db9d39c1ec495df238175288c0d26e2f476bfb 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -44,6 +44,7 @@ from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
 import warnings
 from paddle import _C_ops
+from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -1370,7 +1371,11 @@ class SGDOptimizer(Optimizer):
                          if find_master else None)
 
         lr = self._create_param_lr(param_and_grad)
-        if framework._non_static_mode():
+        if in_dygraph_mode():
+            _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1],
+                                   master_weight, find_master)
+            return None
+        if _in_legacy_dygraph():
             _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1],
                        master_weight, param_and_grad[0], master_weight)
             return None
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index 817150a21f5e56e53d41d485d316802ec6983d8a..ad03fa30009e7a0ebcdc175388127008b517563a 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -21,6 +21,7 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from op_test import OpTest
 import paddle
+from paddle.fluid.framework import _test_eager_guard
 
 paddle.enable_static()
 
@@ -291,6 +292,11 @@ class TestSGDV2(unittest.TestCase):
         adam.step()
         adam.clear_gradients()
 
+    def test_eager(self):
+        with _test_eager_guard():
+            self.test_sgd_dygraph()
+            self.test_sgd_group_dygraph()
+
 
 class TestSGDMultiPrecision2_0(unittest.TestCase):
     def dygraph_sgd_mp(self, mp):
diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py
index fdee57bb1253e0683434faabd17fae6632ad5b72..46dd0b73a5eb8b1f4b63043f489cb84af7040bd7 100644
--- a/python/paddle/optimizer/sgd.py
+++ b/python/paddle/optimizer/sgd.py
@@ -22,6 +22,7 @@ import warnings
 from ..fluid.layer_helper import LayerHelper
 from ..fluid import unique_name
 from ..fluid import layers
+from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode
 
 __all__ = []
 
@@ -144,7 +145,11 @@ class SGD(Optimizer):
                          if find_master else None)
 
         lr = self._create_param_lr(param_and_grad)
-        if framework._non_static_mode():
+        if in_dygraph_mode():
+            _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1],
+                                   master_weight, find_master)
+            return None
+        if _in_legacy_dygraph():
             _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1],
                        master_weight, param_and_grad[0], master_weight)
             return None
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 329882317ee2bc2b61df83ae178e3f0f69a0d649..b4abe5b303b8e6425ebbbe17931e8a6d1db7da16 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -1794,6 +1794,12 @@
     func : selu
   backward : selu_grad
 
+- api : sgd
+  args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision)
+  output : Tensor(param_out), Tensor(master_param_out)
+  invoke : sgd_impl(param, learning_rate, grad, master_param, multi_precision)
+  optional : master_param
+
 - api : shape
   args : (Tensor input)
   output : Tensor
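
Note: the unit test added in test_sgd_op.py exercises the new eager-mode branch through _test_eager_guard. A minimal standalone sketch of the same flow is shown below; it assumes a Paddle build that contains this patch, and the Linear layer, tensor shapes, and learning rate are illustrative only.

    import paddle
    from paddle.fluid.framework import _test_eager_guard

    # Under the eager guard, in_dygraph_mode() is true, so the SGD optimizer
    # takes the new branch added in this patch and calls _C_ops.final_state_sgd,
    # which the api.yaml entry routes to sgd_impl. Outside the guard, the
    # legacy _C_ops.sgd path is used instead.
    with _test_eager_guard():
        linear = paddle.nn.Linear(10, 10)
        sgd = paddle.optimizer.SGD(learning_rate=0.01,
                                   parameters=linear.parameters())
        loss = paddle.mean(linear(paddle.rand([4, 10])))
        loss.backward()
        sgd.step()
        sgd.clear_grad()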