From 261ebb0c14c4218430d947bfc107b21fbe33d4b9 Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Wed, 9 Nov 2022 17:57:43 +0800
Subject: [PATCH] Get grads from cpp for optimizer to avoid gpu idle time
 (#47709)

* Get params and grads in cpp to avoid gpu idle time

* Using python param instead of cpp return param to fix test_asp_optimize_dynamic.py

* Get grads from cpp and construct params_grads on python

* Check meta and remove comments
---
 paddle/fluid/pybind/eager_functions.cc        | 31 +++++++++++++++++--
 .../paddle/fluid/dygraph/amp/loss_scaler.py   |  4 +++
 python/paddle/optimizer/optimizer.py          | 27 +++++++++++-----
 3 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index b1ace9c764..757b5453ec 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -190,18 +190,41 @@ static PyObject* eager_api_tensor_copy(PyObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+PyObject* eager_api_get_all_grads(PyObject* self,
+                                  PyObject* args,
+                                  PyObject* kwargs) {
+  EAGER_TRY
+  auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+
+  std::vector<paddle::experimental::Tensor> ret;
+  for (auto& tensor : tensor_list) {
+    VLOG(6) << "Get grad for tensor: " << tensor.name();
+    auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
+    if (!meta || meta->StopGradient()) {
+      ret.emplace_back(paddle::experimental::Tensor());
+      continue;
+    }
+    if (meta && meta->Grad().initialized()) {
+      ret.emplace_back(meta->Grad());
+    } else {
+      ret.emplace_back(paddle::experimental::Tensor());
+    }
+  }
+  return ToPyObject(ret, true);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 PyObject* eager_api_get_grads_lists(PyObject* self,
                                     PyObject* args,
                                     PyObject* kwargs) {
   EAGER_TRY
   auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
-
+  // The order of the 3 vectors is: FP16_grads, BF16_grads, FP32_grads
   std::vector<std::vector<paddle::experimental::Tensor>> ret(3);
   for (auto& tensor : tensor_list) {
     VLOG(6) << "Get grad for tensor: " << tensor.name();
     auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
-    VLOG(6) << meta << " initialized: " << meta->Grad().initialized();
     if (meta && meta->Grad().initialized()) {
       auto& grad = meta->Grad();
       switch (grad.dtype()) {
@@ -1036,6 +1059,10 @@ PyMethodDef variable_functions[] = {
      (PyCFunction)(void (*)(void))eager_api_tensor_copy,
      METH_VARARGS | METH_KEYWORDS,
      NULL},
+    {"get_all_grads",
+     (PyCFunction)(void (*)(void))eager_api_get_all_grads,
+     METH_VARARGS | METH_KEYWORDS,
+     NULL},
     {"get_grads_lists",
      (PyCFunction)(void (*)(void))eager_api_get_grads_lists,
      METH_VARARGS | METH_KEYWORDS,
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
index 0985237f51..3ecdd7019b 100644
--- a/python/paddle/fluid/dygraph/amp/loss_scaler.py
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -299,12 +299,16 @@ class AmpScaler:
                             param_grads_fp32.append(param._grad_ivar())
         else:
             if in_dygraph_mode():
+                # It is very time-consuming to call c++ functions in a loop on the python side.
+                # We put this part of the code on the c++ side to improve the speed in eager mode.
                 (
                     param_grads_fp16,
                     param_grads_bf16,
                     param_grads_fp32,
                 ) = core.eager.get_grads_lists(optimizer._parameter_list)
             else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when the legacy mode exits.
                 param_grads = [
                     param._grad_ivar()
                     for param in optimizer._parameter_list
diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py
index 36aa9c151d..26ae5b5026 100644
--- a/python/paddle/optimizer/optimizer.py
+++ b/python/paddle/optimizer/optimizer.py
@@ -1028,14 +1028,25 @@ class Optimizer:
 
         if framework._non_static_mode():
             parameter_list = parameters if parameters else self._parameter_list
-            params_grads = []
-            for param in parameter_list:
-                if param.stop_gradient:
-                    continue
-                if param._grad_ivar() is not None:
-                    # create gradient tensor
-                    grad_var = param._grad_ivar()
-                    params_grads.append((param, grad_var))
+            if framework.in_dygraph_mode():
+                # It is very time-consuming to call c++ functions in a loop on the python side.
+                # We put this part of the code on the c++ side to improve the speed in eager mode.
+                params_grads = []
+                grads = core.eager.get_all_grads(parameter_list)
+                for index, grad in enumerate(grads):
+                    if grad is not None:
+                        params_grads.append((parameter_list[index], grad))
+            else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when the legacy mode exits.
+                params_grads = []
+                for param in parameter_list:
+                    if param.stop_gradient:
+                        continue
+                    if param._grad_ivar() is not None:
+                        # create gradient tensor
+                        grad_var = param._grad_ivar()
+                        params_grads.append((param, grad_var))
         else:
             if callbacks is None:
                 callbacks = [error_clip_callback]
-- 
GitLab
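
The comments added in both Python files make the same point: building params_grads by calling param._grad_ivar() once per parameter crosses the Python/C++ boundary inside a Python loop, while core.eager.get_all_grads crosses it once and returns one entry per parameter, with None where the parameter has stop_gradient set or its gradient is uninitialized. Below is a minimal sketch of how the new binding is consumed in eager mode, assuming a Paddle build that contains this patch; the helper name collect_params_grads is illustrative and not part of the patch.

import paddle
from paddle.fluid import core, framework


def collect_params_grads(parameter_list):
    # Pair each trainable parameter with its gradient, mirroring the
    # patched Optimizer.backward logic.
    if framework.in_dygraph_mode():
        # Eager mode: a single C++ call returns gradients in the same order
        # as parameter_list, with None for unset gradients.
        grads = core.eager.get_all_grads(parameter_list)
        return [(p, g) for p, g in zip(parameter_list, grads) if g is not None]
    # Legacy mode: the original per-parameter Python loop.
    return [
        (p, p._grad_ivar())
        for p in parameter_list
        if not p.stop_gradient and p._grad_ivar() is not None
    ]


linear = paddle.nn.Linear(4, 4)
loss = linear(paddle.rand([2, 4])).sum()
loss.backward()
params_grads = collect_params_grads(list(linear.parameters()))

Keeping the original loop in the else branch, as the added comments note, preserves behaviour under legacy dygraph mode until that mode is removed.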