Unverified commit 261ebb0c authored by WangZhen, committed by GitHub

Get grads from cpp for optimizer to avoid GPU idle time (#47709)

* Get params and grads in cpp to avoid GPU idle time

* Use the Python param instead of the cpp-returned param to fix test_asp_optimize_dynamic.py

* Get grads from cpp and construct params_grads on the Python side

* Check meta and remove comments
Parent 1631836f
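In short, the change replaces a per-parameter Python loop with a single C++ call when the optimizer collects gradients in eager mode. A rough sketch of the difference (the Linear model and tensor shapes are illustrative, not part of this commit; assumes Paddle dygraph/eager mode):

```python
import paddle
from paddle.fluid import core

model = paddle.nn.Linear(16, 16)              # illustrative model
model(paddle.rand([8, 16])).sum().backward()
params = list(model.parameters())

# Before: one Python -> C++ round trip per parameter, which can leave the GPU idle.
legacy_params_grads = [
    (p, p._grad_ivar())
    for p in params
    if not p.stop_gradient and p._grad_ivar() is not None
]

# After: a single C++ call; the result is index-aligned with `params`,
# with None for parameters that have no usable gradient.
grads = core.eager.get_all_grads(params)
params_grads = [(p, g) for p, g in zip(params, grads) if g is not None]
```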
@@ -190,18 +190,41 @@ static PyObject* eager_api_tensor_copy(PyObject* self,
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
+PyObject* eager_api_get_all_grads(PyObject* self,
+                                  PyObject* args,
+                                  PyObject* kwargs) {
+  EAGER_TRY
+  auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+  std::vector<paddle::experimental::Tensor> ret;
+  for (auto& tensor : tensor_list) {
+    VLOG(6) << "Get grad for tensor: " << tensor.name();
+    auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
+    if (!meta || meta->StopGradient()) {
+      ret.emplace_back(paddle::experimental::Tensor());
+      continue;
+    }
+    if (meta && meta->Grad().initialized()) {
+      ret.emplace_back(meta->Grad());
+    } else {
+      ret.emplace_back(paddle::experimental::Tensor());
+    }
+  }
+  return ToPyObject(ret, true);
+  EAGER_CATCH_AND_THROW_RETURN_NULL
+}
 PyObject* eager_api_get_grads_lists(PyObject* self,
                                     PyObject* args,
                                     PyObject* kwargs) {
   EAGER_TRY
   auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
+  // The order of the 3 vectors is: FP16_grads, BF16_grads, FP32_grads
   std::vector<std::vector<paddle::experimental::Tensor>> ret(3);
   for (auto& tensor : tensor_list) {
     VLOG(6) << "Get grad for tensor: " << tensor.name();
     auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
-    VLOG(6) << meta << " initialized: " << meta->Grad().initialized();
     if (meta && meta->Grad().initialized()) {
       auto& grad = meta->Grad();
       switch (grad.dtype()) {
@@ -1036,6 +1059,10 @@ PyMethodDef variable_functions[] = {
      (PyCFunction)(void (*)(void))eager_api_tensor_copy,
      METH_VARARGS | METH_KEYWORDS,
      NULL},
+    {"get_all_grads",
+     (PyCFunction)(void (*)(void))eager_api_get_all_grads,
+     METH_VARARGS | METH_KEYWORDS,
+     NULL},
     {"get_grads_lists",
      (PyCFunction)(void (*)(void))eager_api_get_grads_lists,
      METH_VARARGS | METH_KEYWORDS,
...
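For reference, get_grads_lists (shown above) groups the collected gradients by dtype in the order FP16, BF16, FP32, as the added comment documents. A minimal sketch of calling the binding directly (illustrative, not from this commit; with default FP32 parameters every gradient lands in the third list):

```python
import paddle
from paddle.fluid import core

model = paddle.nn.Linear(8, 8)                 # illustrative; parameters are FP32
model(paddle.rand([4, 8])).mean().backward()

fp16_grads, bf16_grads, fp32_grads = core.eager.get_grads_lists(
    list(model.parameters())
)
# fp16_grads and bf16_grads are empty here; fp32_grads holds the weight and bias grads.
```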
@@ -299,12 +299,16 @@ class AmpScaler:
                             param_grads_fp32.append(param._grad_ivar())
         else:
             if in_dygraph_mode():
+                # It is very time-consuming to call c++ functions in a loop on the python side.
+                # We put this part of the code on the c++ side to improve the speed in eager mode.
                 (
                     param_grads_fp16,
                     param_grads_bf16,
                     param_grads_fp32,
                 ) = core.eager.get_grads_lists(optimizer._parameter_list)
             else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when the legacy mode exits.
                 param_grads = [
                     param._grad_ivar()
                     for param in optimizer._parameter_list
...
@@ -1028,14 +1028,25 @@ class Optimizer:
         if framework._non_static_mode():
             parameter_list = parameters if parameters else self._parameter_list
-            params_grads = []
-            for param in parameter_list:
-                if param.stop_gradient:
-                    continue
-                if param._grad_ivar() is not None:
-                    # create gradient tensor
-                    grad_var = param._grad_ivar()
-                    params_grads.append((param, grad_var))
+            if framework.in_dygraph_mode():
+                # It is very time-consuming to call c++ functions in a loop on the python side.
+                # We put this part of the code on the c++ side to improve the speed in eager mode.
+                params_grads = []
+                grads = core.eager.get_all_grads(parameter_list)
+                for index, grad in enumerate(grads):
+                    if grad is not None:
+                        params_grads.append((parameter_list[index], grad))
+            else:
+                # Keep the original code to support legacy mode.
+                # Delete the else branch when the legacy mode exits.
+                params_grads = []
+                for param in parameter_list:
+                    if param.stop_gradient:
+                        continue
+                    if param._grad_ivar() is not None:
+                        # create gradient tensor
+                        grad_var = param._grad_ivar()
+                        params_grads.append((param, grad_var))
         else:
             if callbacks is None:
                 callbacks = [error_clip_callback]
...
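Taken together, a typical dygraph training step now goes through the C++ path inside Optimizer.backward(). A minimal illustrative loop (the model, optimizer, and shapes are placeholders, not part of the commit):

```python
import paddle

model = paddle.nn.Linear(32, 1)
opt = paddle.optimizer.Adam(parameters=model.parameters())

for _ in range(3):
    loss = model(paddle.rand([16, 32])).mean()
    loss.backward()
    # minimize() calls Optimizer.backward(), which now assembles params_grads
    # via core.eager.get_all_grads instead of a per-parameter Python loop.
    opt.minimize(loss)
    opt.clear_grad()
```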