From 261ebb0c14c4218430d947bfc107b21fbe33d4b9 Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Wed, 9 Nov 2022 17:57:43 +0800
Subject: [PATCH] Get grads from cpp for optimizer to avoid gpu idle time
 (#47709)

* Get params and grads in cpp to avoid gpu idle time
* Using python param instead of cpp return param to fix test_asp_optimize_dynamic.py
* Get grads from cpp and construct params_grads on python
* Check meta and remove comments
---
 paddle/fluid/pybind/eager_functions.cc        | 31 +++++++++++++++++--
 .../paddle/fluid/dygraph/amp/loss_scaler.py   |  4 +++
 python/paddle/optimizer/optimizer.py          | 27 +++++++++++-----
 3 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index b1ace9c7641..757b5453ec2 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -190,18 +190,41 @@ static PyObject* eager_api_tensor_copy(PyObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+PyObject* eager_api_get_all_grads(PyObject* self,
+                                  PyObject* args,
+                                  PyObject* kwargs) {
+  EAGER_TRY
+  auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+
+  std::vector<paddle::experimental::Tensor> ret;
+  for (auto& tensor : tensor_list) {
+    VLOG(6) << "Get grad for tensor: " << tensor.name();
+    auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
+    if (!meta || meta->StopGradient()) {
+      ret.emplace_back(paddle::experimental::Tensor());
+      continue;
+    }
+    if (meta && meta->Grad().initialized()) {
+      ret.emplace_back(meta->Grad());
+    } else {
+      ret.emplace_back(paddle::experimental::Tensor());
+    }
+  }
+  return ToPyObject(ret, true);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 PyObject* eager_api_get_grads_lists(PyObject* self,
                                     PyObject* args,
                                     PyObject* kwargs) {
   EAGER_TRY
   auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
-
+  // The order of the 3 vectors is: FP16_grads, BF16_grads, FP32_grads
   std::vector<std::vector<paddle::experimental::Tensor>> ret(3);
   for (auto& tensor : tensor_list) {
     VLOG(6) << "Get grad for tensor: " << tensor.name();
     auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
-    VLOG(6) << meta << " initialized: " << meta->Grad().initialized();
     if (meta && meta->Grad().initialized()) {
       auto& grad = meta->Grad();
       switch (grad.dtype()) {
@@ -1036,6 +1059,10 @@ PyMethodDef variable_functions[] = {
      (PyCFunction)(void (*)(void))eager_api_tensor_copy,
      METH_VARARGS | METH_KEYWORDS,
      NULL},
+    {"get_all_grads",
+     (PyCFunction)(void (*)(void))eager_api_get_all_grads,
+     METH_VARARGS | METH_KEYWORDS,
+     NULL},
     {"get_grads_lists",
      (PyCFunction)(void (*)(void))eager_api_get_grads_lists,
      METH_VARARGS | METH_KEYWORDS,
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
index 0985237f516..3ecdd7019b1 100644
--- a/python/paddle/fluid/dygraph/amp/loss_scaler.py
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -299,12 +299,16 @@ class AmpScaler:
                         param_grads_fp32.append(param._grad_ivar())
         else:
             if in_dygraph_mode():
+                # It is very time-consuming to call C++ functions in a loop on the Python side.
+                # We put this part of the code on the C++ side to improve the speed in eager mode.
                 (
                     param_grads_fp16,
                     param_grads_bf16,
                     param_grads_fp32,
                 ) = core.eager.get_grads_lists(optimizer._parameter_list)
             else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when legacy mode is removed.
                 param_grads = [
                     param._grad_ivar()
                     for param in optimizer._parameter_list
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 36aa9c151d7..26ae5b50269 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -1028,14 +1028,25 @@ class Optimizer:
 
         if framework._non_static_mode():
             parameter_list = parameters if parameters else self._parameter_list
-            params_grads = []
-            for param in parameter_list:
-                if param.stop_gradient:
-                    continue
-                if param._grad_ivar() is not None:
-                    # create gradient tensor
-                    grad_var = param._grad_ivar()
-                    params_grads.append((param, grad_var))
+            if framework.in_dygraph_mode():
+                # It is very time-consuming to call C++ functions in a loop on the Python side.
+                # We put this part of the code on the C++ side to improve the speed in eager mode.
+                params_grads = []
+                grads = core.eager.get_all_grads(parameter_list)
+                for index, grad in enumerate(grads):
+                    if grad is not None:
+                        params_grads.append((parameter_list[index], grad))
+            else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when legacy mode is removed.
+                params_grads = []
+                for param in parameter_list:
+                    if param.stop_gradient:
+                        continue
+                    if param._grad_ivar() is not None:
+                        # create gradient tensor
+                        grad_var = param._grad_ivar()
+                        params_grads.append((param, grad_var))
         else:
             if callbacks is None:
                 callbacks = [error_clip_callback]
--
GitLab
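
For reference, below is a minimal Python sketch of how the new core.eager.get_all_grads binding is consumed, mirroring the patched Optimizer.backward() path. It assumes a PaddlePaddle build that includes this patch and runs in eager (dygraph) mode; the collect_params_grads helper and the tiny Linear usage example are hypothetical illustrations, not part of the patch.

import paddle
from paddle.fluid import core, framework


def collect_params_grads(parameter_list):
    # Hypothetical helper that mirrors the patched Optimizer.backward() logic.
    if framework.in_dygraph_mode():
        # One call across the Python/C++ boundary instead of one _grad_ivar()
        # call per parameter; parameters with stop_gradient=True or with an
        # uninitialized grad come back as None (see eager_api_get_all_grads).
        grads = core.eager.get_all_grads(parameter_list)
        return [
            (param, grad)
            for param, grad in zip(parameter_list, grads)
            if grad is not None
        ]
    # Legacy mode: keep the original per-parameter Python loop.
    params_grads = []
    for param in parameter_list:
        if param.stop_gradient:
            continue
        if param._grad_ivar() is not None:
            params_grads.append((param, param._grad_ivar()))
    return params_grads


# Usage sketch: one backward pass on a small layer, then grad collection.
linear = paddle.nn.Linear(4, 4)
loss = linear(paddle.randn([2, 4])).sum()
loss.backward()
params_grads = collect_params_grads(linear.parameters())

As the commit title and the added comments indicate, the point of the binding is to collapse the per-parameter Python loop into a single C++ call so the GPU is not left idle while Python iterates over the parameter list.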