Unverified commit 261ebb0c authored by WangZhen, committed by GitHub

Get grads from cpp for optimizer to avoid gpu idle time (#47709)

* Get params and grads in cpp to avoid gpu idle time

* Using python param instead of cpp return param to fix test_asp_optimize_dynamic.py

* Get grads from cpp and construct params_grads on python

* Check meta and remove comments
Parent 1631836f
@@ -190,18 +190,41 @@ static PyObject* eager_api_tensor_copy(PyObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyObject* eager_api_get_all_grads(PyObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
std::vector<paddle::experimental::Tensor> ret;
for (auto& tensor : tensor_list) {
VLOG(6) << "Get grad for tensor: " << tensor.name();
auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
if (!meta || meta->StopGradient()) {
ret.emplace_back(paddle::experimental::Tensor());
continue;
}
if (meta && meta->Grad().initialized()) {
ret.emplace_back(meta->Grad());
} else {
ret.emplace_back(paddle::experimental::Tensor());
}
}
return ToPyObject(ret, true);
EAGER_CATCH_AND_THROW_RETURN_NULL
}
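For context, here is a minimal sketch of how the new `get_all_grads` binding can be driven from Python in eager (dygraph) mode. It assumes that `ToPyObject(ret, true)` surfaces the uninitialized placeholder tensors as `None` (consistent with the `grad is not None` check in the optimizer change below); the model and the `paddle.fluid.core` import path are illustrative.

```python
import paddle
from paddle.fluid import core  # assumed import path for the eager bindings

paddle.disable_static()  # eager (dygraph) mode

linear = paddle.nn.Linear(4, 4)
loss = linear(paddle.rand([2, 4])).sum()
loss.backward()

# One C++ call replaces a per-parameter Python loop over param._grad_ivar().
params = list(linear.parameters())
grads = core.eager.get_all_grads(params)
params_grads = [(p, g) for p, g in zip(params, grads) if g is not None]
```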
PyObject* eager_api_get_grads_lists(PyObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto tensor_list = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
// The order of the 3 vectors is: FP16_grads, BF16_grads, FP32_grads
std::vector<std::vector<paddle::experimental::Tensor>> ret(3);
for (auto& tensor : tensor_list) {
VLOG(6) << "Get grad for tensor: " << tensor.name();
auto meta = egr::EagerUtils::nullable_autograd_meta(tensor);
VLOG(6) << meta << " initialized: " << meta->Grad().initialized();
if (meta && meta->Grad().initialized()) {
auto& grad = meta->Grad();
switch (grad.dtype()) {
@@ -1036,6 +1059,10 @@ PyMethodDef variable_functions[] = {
(PyCFunction)(void (*)(void))eager_api_tensor_copy,
METH_VARARGS | METH_KEYWORDS,
NULL},
{"get_all_grads",
(PyCFunction)(void (*)(void))eager_api_get_all_grads,
METH_VARARGS | METH_KEYWORDS,
NULL},
{"get_grads_lists",
(PyCFunction)(void (*)(void))eager_api_get_grads_lists,
METH_VARARGS | METH_KEYWORDS,
......
@@ -299,12 +299,16 @@ class AmpScaler:
param_grads_fp32.append(param._grad_ivar())
else:
if in_dygraph_mode():
# Calling C++ functions in a loop on the Python side is very time-consuming.
# This part of the code is moved to the C++ side to improve speed in eager mode.
(
param_grads_fp16,
param_grads_bf16,
param_grads_fp32,
) = core.eager.get_grads_lists(optimizer._parameter_list)
else:
# Keep the original code to support legacy mode.
# Delete this else branch once legacy mode is removed.
param_grads = [
param._grad_ivar()
for param in optimizer._parameter_list
......
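For reference, a pure-Python sketch of the dtype bucketing that `get_grads_lists` performs on the C++ side (the `switch` on `grad.dtype()` is truncated above). The three-list order follows the comment in the C++ code; the `paddle.float16` / `paddle.bfloat16` / `paddle.float32` dtype aliases and the `_grad_ivar()` accessor are assumptions based on the surrounding code, so treat this as an illustration of the contract rather than the actual implementation.

```python
import paddle

def group_grads_by_dtype(parameter_list):
    """Illustrative Python equivalent of core.eager.get_grads_lists."""
    fp16_grads, bf16_grads, fp32_grads = [], [], []
    for param in parameter_list:
        grad = param._grad_ivar()
        if grad is None:
            continue
        if grad.dtype == paddle.float16:
            fp16_grads.append(grad)
        elif grad.dtype == paddle.bfloat16:
            bf16_grads.append(grad)
        elif grad.dtype == paddle.float32:
            fp32_grads.append(grad)
    # Order matches the C++ comment: FP16_grads, BF16_grads, FP32_grads.
    return fp16_grads, bf16_grads, fp32_grads
```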
@@ -1028,14 +1028,25 @@ class Optimizer:
if framework._non_static_mode():
parameter_list = parameters if parameters else self._parameter_list
params_grads = []
for param in parameter_list:
if param.stop_gradient:
continue
if param._grad_ivar() is not None:
# create gradient tensor
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
if framework.in_dygraph_mode():
# Calling C++ functions in a loop on the Python side is very time-consuming.
# This part of the code is moved to the C++ side to improve speed in eager mode.
params_grads = []
grads = core.eager.get_all_grads(parameter_list)
for index, grad in enumerate(grads):
if grad is not None:
params_grads.append((parameter_list[index], grad))
else:
# Keep the original code to support legacy mode.
# Delete this else branch once legacy mode is removed.
params_grads = []
for param in parameter_list:
if param.stop_gradient:
continue
if param._grad_ivar() is not None:
# create gradient tensor
grad_var = param._grad_ivar()
params_grads.append((param, grad_var))
else:
if callbacks is None:
callbacks = [error_clip_callback]
......
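End to end, the new dygraph path can be exercised roughly as follows. This is a sketch that assumes the modified block sits in `Optimizer.backward()`, which `minimize()` reaches in dygraph mode.

```python
import paddle

paddle.disable_static()  # dygraph / eager mode

model = paddle.nn.Linear(4, 4)
opt = paddle.optimizer.SGD(learning_rate=0.1, parameters=model.parameters())

loss = model(paddle.rand([2, 4])).sum()
# minimize() runs backward() and then applies the gradients; with this change,
# eager mode builds params_grads through a single core.eager.get_all_grads call.
opt.minimize(loss)
opt.clear_grad()
```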