From 01bfe7863395b147fcee69efdc047a819f3c4631 Mon Sep 17 00:00:00 2001
From: WangZhen <23097963+0x45f@users.noreply.github.com>
Date: Mon, 7 Nov 2022 11:26:43 +0800
Subject: [PATCH] Get three grad lists in CPP to avoid gpu idle time (#47665)

* Get three grad lists in CPP to avoid gpu idle time

* Support legacy mode
---
 paddle/fluid/pybind/eager_functions.cc        | 39 +++++++++++++++
 paddle/fluid/pybind/eager_utils.cc            | 11 +++++
 paddle/fluid/pybind/eager_utils.h             |  2 +
 .../paddle/fluid/dygraph/amp/loss_scaler.py   | 48 +++++++++++--------
 4 files changed, 80 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index db0da96493..b1ace9c764 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -190,6 +190,41 @@ static PyObject* eager_api_tensor_copy(PyObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
 
+PyObject* eager_api_get_grads_lists(PyObject* self,
+                                    PyObject* args,
+                                    PyObject* kwargs) {
+  EAGER_TRY
+  auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+
+  std::vector<std::vector<paddle::experimental::Tensor>> ret(3);
+
+  for (auto& tensor : tensor_list) {
+    VLOG(6) << "Get grad for tensor: " << tensor.name();
+    auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
+    VLOG(6) << meta << " initialized: " << meta->Grad().initialized();
+    if (meta && meta->Grad().initialized()) {
+      auto& grad = meta->Grad();
+      switch (grad.dtype()) {
+        case paddle::experimental::DataType::FLOAT16:
+          ret[0].emplace_back(grad);
+          break;
+        case paddle::experimental::DataType::BFLOAT16:
+          ret[1].emplace_back(grad);
+          break;
+        case paddle::experimental::DataType::FLOAT32:
+          ret[2].emplace_back(grad);
+          break;
+        default:
+          break;
+      }
+    }
+  }
+
+  return ToPyObject(ret);
+
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
+
 static PyObject* eager_api_read_next_tensor_list(PyObject* self,
                                                  PyObject* args,
                                                  PyObject* kwargs) {
@@ -1001,6 +1036,10 @@ PyMethodDef variable_functions[] = {
      (PyCFunction)(void (*)(void))eager_api_tensor_copy,
      METH_VARARGS | METH_KEYWORDS,
      NULL},
+    {"get_grads_lists",
+     (PyCFunction)(void (*)(void))eager_api_get_grads_lists,
+     METH_VARARGS | METH_KEYWORDS,
+     NULL},
     {"read_next_tensor_list",
      (PyCFunction)(void (*)(void))eager_api_read_next_tensor_list,
      METH_VARARGS | METH_KEYWORDS,
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 895a715c8b..1237e4092f 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -720,6 +720,17 @@ PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
   return result;
 }
 
+PyObject* ToPyObject(
+    const std::vector<std::vector<paddle::experimental::Tensor>>& value) {
+  PyObject* result = PyList_New((Py_ssize_t)value.size());
+
+  for (size_t i = 0; i < value.size(); i++) {
+    PyList_SET_ITEM(result, static_cast<Py_ssize_t>(i), ToPyObject(value[i]));
+  }
+
+  return result;
+}
+
 PyObject* ToPyObject(const platform::Place& value) {
   auto obj = ::pybind11::cast(value);
   obj.inc_ref();
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index 845b681163..145eeacc04 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -103,6 +103,8 @@ PyObject* ToPyObject(const std::vector<double>& value);
 PyObject* ToPyObject(const std::vector<std::vector<size_t>>& value);
 PyObject* ToPyObject(const std::vector<paddle::experimental::Tensor>& value,
                      bool return_py_none_if_not_initialize = false);
+PyObject* ToPyObject(
+    const std::vector<std::vector<paddle::experimental::Tensor>>& value);
 PyObject* ToPyObject(const platform::Place& value);
 PyObject* ToPyObject(const phi::DenseTensor* value);
 PyObject* ToPyObject(const phi::SelectedRows* value);
diff --git a/python/paddle/fluid/dygraph/amp/loss_scaler.py b/python/paddle/fluid/dygraph/amp/loss_scaler.py
index 6ab153e4a2..c59588e9d0 100644
--- a/python/paddle/fluid/dygraph/amp/loss_scaler.py
+++ b/python/paddle/fluid/dygraph/amp/loss_scaler.py
@@ -26,6 +26,7 @@ import numpy as np
 from paddle import _C_ops, _legacy_C_ops
 from collections import defaultdict
 from enum import Enum
+from paddle.fluid import in_dygraph_mode
 
 __all__ = ['AmpScaler', 'OptimizerState']
 
@@ -297,26 +298,33 @@ class AmpScaler(object):
                         else:
                             param_grads_fp32.append(param._grad_ivar())
         else:
-            param_grads = [
-                param._grad_ivar()
-                for param in optimizer._parameter_list
-                if param._grad_ivar() is not None
-            ]
-            param_grads_fp16 = [
-                param
-                for param in param_grads
-                if param.dtype == core.VarDesc.VarType.FP16
-            ]
-            param_grads_bf16 = [
-                param
-                for param in param_grads
-                if param.dtype == core.VarDesc.VarType.BF16
-            ]
-            param_grads_fp32 = [
-                param
-                for param in param_grads
-                if param.dtype == core.VarDesc.VarType.FP32
-            ]
+            if in_dygraph_mode():
+                (
+                    param_grads_fp16,
+                    param_grads_bf16,
+                    param_grads_fp32,
+                ) = core.eager.get_grads_lists(optimizer._parameter_list)
+            else:
+                param_grads = [
+                    param._grad_ivar()
+                    for param in optimizer._parameter_list
+                    if param._grad_ivar() is not None
+                ]
+                param_grads_fp16 = [
+                    param
+                    for param in param_grads
+                    if param.dtype == core.VarDesc.VarType.FP16
+                ]
+                param_grads_bf16 = [
+                    param
+                    for param in param_grads
+                    if param.dtype == core.VarDesc.VarType.BF16
+                ]
+                param_grads_fp32 = [
+                    param
+                    for param in param_grads
+                    if param.dtype == core.VarDesc.VarType.FP32
+                ]
         if core.is_compiled_with_npu():
             float_status = _legacy_C_ops.alloc_float_status()
             _legacy_C_ops.clear_float_status(float_status, float_status)
-- 
GitLab
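
Usage note (not part of the patch): a minimal sketch of how the new binding can be called, assuming a PaddlePaddle build that already contains this change and is running in eager (dygraph) mode. The Linear layer, tensor shapes, and variable names below are illustrative only; core.eager.get_grads_lists is the function registered above, which takes a list of parameters and returns three lists of initialized gradients bucketed by dtype in a single C++ pass.

# Hypothetical usage sketch; assumes a Paddle build that includes this patch.
import paddle
from paddle.fluid import core

linear = paddle.nn.Linear(4, 4)             # parameters created as FP32
loss = linear(paddle.randn([2, 4])).mean()
loss.backward()                             # populate gradients

# Bucket initialized grads by dtype in one call: [fp16, bf16, fp32].
fp16_grads, bf16_grads, fp32_grads = core.eager.get_grads_lists(
    list(linear.parameters())
)
print(len(fp16_grads), len(bf16_grads), len(fp32_grads))  # expected: 0 0 2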