Unverified commit 261ebb0c authored by WangZhen, committed by GitHub

Get grads from cpp for optimizer to avoid GPU idle time (#47709)

* Get params and grads in cpp to avoid GPU idle time

* Use the Python param instead of the cpp-returned param to fix test_asp_optimize_dynamic.py

* Get grads from cpp and construct params_grads on the Python side

* Check meta and remove comments
Parent 1631836f
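In short, the change replaces a per-parameter Python loop with a single C++ call when the optimizer collects gradients in eager mode. A rough sketch of the difference (the Linear model and tensor shapes are illustrative, not part of this commit; assumes Paddle dygraph/eager mode):

```python
import paddle
from paddle.fluid import core

model = paddle.nn.Linear(16, 16)              # illustrative model
model(paddle.rand([8, 16])).sum().backward()
params = list(model.parameters())

# Before: one Python -> C++ round trip per parameter, which can leave the GPU idle.
legacy_params_grads = [
    (p, p._grad_ivar())
    for p in params
    if not p.stop_gradient and p._grad_ivar() is not None
]

# After: a single C++ call; the result is index-aligned with `params`,
# with None for parameters that have no usable gradient.
grads = core.eager.get_all_grads(params)
params_grads = [(p, g) for p, g in zip(params, grads) if g is not None]
```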
@@ -190,18 +190,41 @@ static PyObject* eager_api_tensor_copy(PyObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
+PyObject* eager_api_get_all_grads(PyObject* self,
+                                  PyObject* args,
+                                  PyObject* kwargs) {
+  EAGER_TRY
+  auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+  std::vector<paddle::experimental::Tensor> ret;
+  for (auto& tensor : tensor_list) {
+    VLOG(6) << "Get grad for tensor: " << tensor.name();
+    auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
+    if (!meta || meta->StopGradient()) {
+      ret.emplace_back(paddle::experimental::Tensor());
+      continue;
+    }
+    if (meta && meta->Grad().initialized()) {
+      ret.emplace_back(meta->Grad());
+    } else {
+      ret.emplace_back(paddle::experimental::Tensor());
+    }
+  }
+  return ToPyObject(ret, true);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
 PyObject* eager_api_get_grads_lists(PyObject* self,
                                     PyObject* args,
                                     PyObject* kwargs) {
   EAGER_TRY
   auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+  // The order of the 3 vectors is: FP16_grads, BF16_grads, FP32_grads
   std::vector<std::vector<paddle::experimental::Tensor>> ret(3);
   for (auto& tensor : tensor_list) {
     VLOG(6) << "Get grad for tensor: " << tensor.name();
     auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
-    VLOG(6) << meta << " initialized: " << meta->Grad().initialized();
     if (meta && meta->Grad().initialized()) {
       auto& grad = meta->Grad();
       switch (grad.dtype()) {
@@ -1036,6 +1059,10 @@ PyMethodDef variable_functions[] = {
      (PyCFunction)(void (*)(void))eager_api_tensor_copy,
      METH_VARARGS | METH_KEYWORDS,
      NULL},
+    {"get_all_grads",
+     (PyCFunction)(void (*)(void))eager_api_get_all_grads,
+     METH_VARARGS | METH_KEYWORDS,
+     NULL},
     {"get_grads_lists",
      (PyCFunction)(void (*)(void))eager_api_get_grads_lists,
      METH_VARARGS | METH_KEYWORDS,
...
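For reference, get_grads_lists (shown above) groups the collected gradients by dtype in the order FP16, BF16, FP32, as the added comment documents. A minimal sketch of calling the binding directly (illustrative, not from this commit; with default FP32 parameters every gradient lands in the third list):

```python
import paddle
from paddle.fluid import core

model = paddle.nn.Linear(8, 8)                 # illustrative; parameters are FP32
model(paddle.rand([4, 8])).mean().backward()

fp16_grads, bf16_grads, fp32_grads = core.eager.get_grads_lists(
    list(model.parameters())
)
# fp16_grads and bf16_grads are empty here; fp32_grads holds the weight and bias grads.
```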
@@ -299,12 +299,16 @@ class AmpScaler:
                             param_grads_fp32.append(param._grad_ivar())
         else:
             if in_dygraph_mode():
+                # It is very time-consuming to call c++ functions in a loop on the python side.
+                # We put this part of the code on the c++ side to improve the speed in eager mode.
                 (
                     param_grads_fp16,
                     param_grads_bf16,
                     param_grads_fp32,
                 ) = core.eager.get_grads_lists(optimizer._parameter_list)
             else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when the legacy mode exits.
                 param_grads = [
                     param._grad_ivar()
                     for param in optimizer._parameter_list
...
@@ -1028,14 +1028,25 @@ class Optimizer:
         if framework._non_static_mode():
             parameter_list = parameters if parameters else self._parameter_list
-            params_grads = []
-            for param in parameter_list:
-                if param.stop_gradient:
-                    continue
-                if param._grad_ivar() is not None:
-                    # create gradient tensor
-                    grad_var = param._grad_ivar()
-                    params_grads.append((param, grad_var))
+            if framework.in_dygraph_mode():
+                # It is very time-consuming to call c++ functions in a loop on the python side.
+                # We put this part of the code on the c++ side to improve the speed in eager mode.
+                params_grads = []
+                grads = core.eager.get_all_grads(parameter_list)
+                for index, grad in enumerate(grads):
+                    if grad is not None:
+                        params_grads.append((parameter_list[index], grad))
+            else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when the legacy mode exits.
+                params_grads = []
+                for param in parameter_list:
+                    if param.stop_gradient:
+                        continue
+                    if param._grad_ivar() is not None:
+                        # create gradient tensor
+                        grad_var = param._grad_ivar()
+                        params_grads.append((param, grad_var))
         else:
             if callbacks is None:
                 callbacks = [error_clip_callback]
...
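Taken together, a typical dygraph training step now goes through the C++ path inside Optimizer.backward(). A minimal illustrative loop (the model, optimizer, and shapes are placeholders, not part of the commit):

```python
import paddle

model = paddle.nn.Linear(32, 1)
opt = paddle.optimizer.Adam(parameters=model.parameters())

for _ in range(3):
    loss = model(paddle.rand([16, 32])).mean()
    loss.backward()
    # minimize() calls Optimizer.backward(), which now assembles params_grads
    # via core.eager.get_all_grads instead of a per-parameter Python loop.
    opt.minimize(loss)
    opt.clear_grad()
```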