From b106c4246eb325287fe8a3f65646bb81fb3bdfad Mon Sep 17 00:00:00 2001
From: wanghuancoder
Date: Tue, 27 Sep 2022 11:21:15 +0800
Subject: [PATCH] [Eager] refine gil use (#46452)

* refine gil use
---
 paddle/fluid/eager/pylayer/py_layer_node.cc |   5 +
 paddle/fluid/eager/pylayer/py_layer_node.h  |   2 +-
 paddle/fluid/pybind/eager_functions.cc      | 811 ++++++++++----------
 paddle/fluid/pybind/eager_method.cc         |  97 ++-
 4 files changed, 487 insertions(+), 428 deletions(-)

diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc
index 6fb78d20e8a..0e89bab9a08 100644
--- a/paddle/fluid/eager/pylayer/py_layer_node.cc
+++ b/paddle/fluid/eager/pylayer/py_layer_node.cc
@@ -27,6 +27,11 @@
 #include "pybind11/pytypes.h"
 
 namespace egr {
+GradNodePyLayer::~GradNodePyLayer() {
+  pybind11::gil_scoped_acquire gil;
+  Py_XDECREF(ctx_);
+}
+
 paddle::small_vector, kSlotSmallVectorSize>
 GradNodePyLayer::operator()(
diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h
index 18c48b62c4f..8c93eb944f7 100644
--- a/paddle/fluid/eager/pylayer/py_layer_node.h
+++ b/paddle/fluid/eager/pylayer/py_layer_node.h
@@ -34,7 +34,7 @@ class GradNodePyLayer : public GradNodeBase {
     Py_INCREF(ctx_);
   }
 
-  ~GradNodePyLayer() override { Py_XDECREF(ctx_); };
+  ~GradNodePyLayer() override;
 
   virtual paddle::small_vector, kSlotSmallVectorSize>
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 956d8e5814c..64a697f0094 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -107,12 +107,18 @@ static PyObject* eager_api_scale(PyObject* self,
                                 PyObject* kwargs) {
   EAGER_TRY
   // TODO(jiabin): Sync Tensor and Variable here when we support
-  paddle::experimental::Tensor ret = egr::scale(
-      reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor,
-      CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 1), 1),
-      CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 2), 2),
-      CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3),
-      CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4));
+
+  auto& tensor =
+      reinterpret_cast(PyTuple_GET_ITEM(args, 0))->tensor;
+  float scale = CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 1), 1);
+  float bias = CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 2), 2);
+  bool bias_after_scale = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
+  bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4);
+  paddle::experimental::Tensor ret;
+  {
+    eager_gil_scoped_release guard;
+    ret = egr::scale(tensor, scale, bias, bias_after_scale, trace_backward);
+  }
   return ToPyObject(ret);
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
@@ -123,11 +129,10 @@ static PyObject* eager_api_run_backward(PyObject* self,
   EAGER_TRY
   auto tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 0), 0);
   auto grad_tensors = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1);
+  bool retain_graph = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2);
   {
     eager_gil_scoped_release guard;
-    egr::Backward(tensors,
-                  grad_tensors,
-                  CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2));
+    egr::Backward(tensors, grad_tensors, retain_graph);
   }
   RETURN_PY_NONE
   EAGER_CATCH_AND_THROW_RETURN_NULL
 }
@@ -156,8 +161,8 @@ static PyObject* eager_api_run_partial_grad(PyObject* self,
                        only_inputs,
                        allow_unused,
                        no_grad_vars);
+    VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad";
   }
-  VLOG(1) << " in eager_api_run_partial_grad, after runing egr::Grad";
   return ToPyObject(result, true /* 
return_py_none_if_not_initialize */); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -173,11 +178,14 @@ static PyObject* eager_api_tensor_copy(PyObject* self, auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - dst = src.copy_to(place, blocking); - egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( - egr::EagerUtils::autograd_meta(&(src))->StopGradient()); - egr::EagerUtils::autograd_meta(&dst)->SetPersistable( - egr::EagerUtils::autograd_meta(&(src))->Persistable()); + { + eager_gil_scoped_release guard; + dst = src.copy_to(place, blocking); + egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( + egr::EagerUtils::autograd_meta(&(src))->StopGradient()); + egr::EagerUtils::autograd_meta(&dst)->SetPersistable( + egr::EagerUtils::autograd_meta(&(src))->Persistable()); + } RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -378,7 +386,11 @@ static PyObject* eager_api_jit_function_call(PyObject* self, CastPyArg2JitFunction(PyTuple_GET_ITEM(args, 0), 0); std::vector ins = CastPyArg2VectorOfTensor(PyTuple_GET_ITEM(args, 1), 1); - std::vector outs = (*function)(ins); + std::vector outs; + { + eager_gil_scoped_release guard; + outs = (*function)(ins); + } return ToPyObject(outs); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -391,116 +403,120 @@ static PyObject* eager_api_run_costum_op(PyObject* self, CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0); std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2); - VLOG(7) << "Get things for python for Custom Op: " << op_type - << ", trace_backward is: " << trace_backward; - auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); - PADDLE_ENFORCE_NE(meta_info_map.find(op_type), - meta_info_map.end(), - paddle::platform::errors::NotFound( - "Can't find %s in Eager OpMetaInfoMap which should be " - "created by LoadOpMetaInfoAndRegisterOp, please make " - "sure you registered your op first and try again. ", - op_type)); - VLOG(7) << "Run Kernel of Custom Op: " << op_type; - std::vector res_attrs = - CastAttrsToTragetType(ctx.Attrs(), - paddle::framework::OpMetaInfoHelper::GetAttrs( - meta_info_map.at(op_type)[0])); - ctx.EmplaceBackAttrs(res_attrs); - const auto& vec_map = meta_info_map.at(op_type); - (*paddle::framework::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); + { + eager_gil_scoped_release guard; + VLOG(7) << "Get things for python for Custom Op: " << op_type + << ", trace_backward is: " << trace_backward; + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + PADDLE_ENFORCE_NE( + meta_info_map.find(op_type), + meta_info_map.end(), + paddle::platform::errors::NotFound( + "Can't find %s in Eager OpMetaInfoMap which should be " + "created by LoadOpMetaInfoAndRegisterOp, please make " + "sure you registered your op first and try again. 
", + op_type)); + VLOG(7) << "Run Kernel of Custom Op: " << op_type; + std::vector res_attrs = + CastAttrsToTragetType(ctx.Attrs(), + paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[0])); + ctx.EmplaceBackAttrs(res_attrs); + const auto& vec_map = meta_info_map.at(op_type); + (*paddle::framework::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); - VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; - std::vector> ins_auto_grad_metas; - std::vector> outs_auto_grad_metas; - VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); - ins_auto_grad_metas.resize(ctx.InputRange().size()); - VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); - outs_auto_grad_metas.resize(ctx.OutputRange().size()); + VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; + std::vector> ins_auto_grad_metas; + std::vector> outs_auto_grad_metas; + VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); + ins_auto_grad_metas.resize(ctx.InputRange().size()); + VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); + outs_auto_grad_metas.resize(ctx.OutputRange().size()); - for (size_t i = 0; i < ctx.InputRange().size(); i++) { - ins_auto_grad_metas[i] = - egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( - ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); - } - for (size_t i = 0; i < ctx.OutputRange().size(); i++) { - outs_auto_grad_metas[i] = - egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( - ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); - } - bool require_any_grad = false; - for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { - require_any_grad = - require_any_grad || egr::EagerUtils::ComputeRequireGrad( - trace_backward, &(ins_auto_grad_metas[i])); - } - if (require_any_grad && (vec_map.size() > 1)) { - VLOG(6) << " Construct Grad for Custom Op: " << op_type; - ConstructFwdAndBwdMap(vec_map, op_type); - for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { - egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + ins_auto_grad_metas[i] = + egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( + ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); + } + for (size_t i = 0; i < ctx.OutputRange().size(); i++) { + outs_auto_grad_metas[i] = + egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); } - auto grad_node = std::make_shared( - outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); - auto slot_map = - egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); - // Prepare Grad outputs - size_t no_grad_cnt = 0; + bool require_any_grad = false; for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { - const std::vector& in_tensors = - ctx.InputsBetween(ctx.InputRangeAt(i).first, - ctx.InputRangeAt(i).second); + require_any_grad = + require_any_grad || egr::EagerUtils::ComputeRequireGrad( + trace_backward, &(ins_auto_grad_metas[i])); + } + if (require_any_grad && (vec_map.size() > 1)) { + VLOG(6) << " Construct Grad for Custom Op: " << op_type; + ConstructFwdAndBwdMap(vec_map, op_type); + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + } + auto grad_node = std::make_shared( + outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); + auto slot_map = + 
egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); + // Prepare Grad outputs + size_t no_grad_cnt = 0; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); - if (slot_map[0][0].find(i) != slot_map[0][0].end()) { - grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); - } else { - grad_node->SetGradOutMeta(in_tensors, - ins_auto_grad_metas.size() - 1 - no_grad_cnt); - no_grad_cnt++; + if (slot_map[0][0].find(i) != slot_map[0][0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); + } else { + grad_node->SetGradOutMeta( + in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); + no_grad_cnt++; + } } - } - // Prepare Grad inputs with grad of fwd outputs - for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { - const std::vector& out_tensors = - ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, - ctx.OutputRangeAt(i).second); + // Prepare Grad inputs with grad of fwd outputs + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); - egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); - egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); - grad_node->SetGradInMeta(out_tensors, i); - egr::EagerUtils::CheckAndRetainGrad(out_tensors); - } + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); + egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); + } - // Prepare Grad inputs with fwd outputs - for (auto it = slot_map[0][2].begin(); it != slot_map[0][2].end(); it++) { - VLOG(7) << "Prepare fwd_outs: " << it->first - << " to grad_inputs: " << it->second; - grad_node->fwd_outs[it->second] = - egr::RunCustomOpNode::ConstructTensorWrapper( - ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, - ctx.OutputRangeAt(it->first).second)); - } + // Prepare Grad inputs with fwd outputs + for (auto it = slot_map[0][2].begin(); it != slot_map[0][2].end(); it++) { + VLOG(7) << "Prepare fwd_outs: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_outs[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, + ctx.OutputRangeAt(it->first).second)); + } - // Prepare Grad inputs with fwd inputs - for (auto it = slot_map[0][3].begin(); it != slot_map[0][3].end(); it++) { - VLOG(7) << "Prepare fwd_ins: " << it->first - << " to grad_inputs: " << it->second; - grad_node->fwd_ins[it->second] = - egr::RunCustomOpNode::ConstructTensorWrapper( - ctx.InputsBetween(ctx.InputRangeAt(it->first).first, - ctx.InputRangeAt(it->first).second)); - } + // Prepare Grad inputs with fwd inputs + for (auto it = slot_map[0][3].begin(); it != slot_map[0][3].end(); it++) { + VLOG(7) << "Prepare fwd_ins: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_ins[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.InputsBetween(ctx.InputRangeAt(it->first).first, + ctx.InputRangeAt(it->first).second)); + } - auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( - meta_info_map.at(op_type)[1]); - std::vector attrs(attrs_names.size()); - // Prepare attrs for Grad node - for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { - VLOG(7) << "Prepare fwd attrs: " << 
it->first - << " to grad_attrs: " << it->second; - attrs[it->second] = res_attrs[it->first]; + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type)[1]); + std::vector attrs(attrs_names.size()); + // Prepare attrs for Grad node + for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { + VLOG(7) << "Prepare fwd attrs: " << it->first + << " to grad_attrs: " << it->second; + attrs[it->second] = res_attrs[it->first]; + } + grad_node->SetAttrs(attrs); } - grad_node->SetAttrs(attrs); } RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL @@ -514,33 +530,36 @@ static PyObject* eager_api_sparse_coo_tensor(PyObject* self, auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 1), 1); auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 2), 2); auto stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - PADDLE_ENFORCE(non_zero_indices.is_dense_tensor(), - paddle::platform::errors::Fatal( - "the non-zero indices must be a DenseTensor.")); - PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(), - paddle::platform::errors::Fatal( - "the non-zero elements must be a DenseTensor.")); - auto dense_indices = - std::dynamic_pointer_cast(non_zero_indices.impl()); - auto dense_elements = - std::dynamic_pointer_cast(non_zero_elements.impl()); - // TODO(zhangkaihuo): After creating SparseCooTensor, call coalesced() to sort - // and merge duplicate indices - std::shared_ptr coo_tensor = - std::make_shared( - *dense_indices, *dense_elements, phi::make_ddim(dense_shape)); paddle::experimental::Tensor tensor; - tensor.set_impl(coo_tensor); - auto name = - egr::Controller::Instance().GenerateUniqueName("generated_tensor"); - tensor.set_name(name); - auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); - autograd_meta->SetStopGradient(static_cast(stop_gradient)); - if (!autograd_meta->GetMutableGradNode()) { - VLOG(3) << "Tensor(" << name - << ") doesn't have GradNode, add GradNodeAccumulation to it."; - autograd_meta->SetGradNode( - std::make_shared(autograd_meta)); + { + eager_gil_scoped_release guard; + PADDLE_ENFORCE(non_zero_indices.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero indices must be a DenseTensor.")); + PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero elements must be a DenseTensor.")); + auto dense_indices = + std::dynamic_pointer_cast(non_zero_indices.impl()); + auto dense_elements = + std::dynamic_pointer_cast(non_zero_elements.impl()); + // TODO(zhangkaihuo): After creating SparseCooTensor, call coalesced() to + // sort and merge duplicate indices + std::shared_ptr coo_tensor = + std::make_shared( + *dense_indices, *dense_elements, phi::make_ddim(dense_shape)); + tensor.set_impl(coo_tensor); + auto name = + egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + tensor.set_name(name); + auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); + autograd_meta->SetStopGradient(static_cast(stop_gradient)); + if (!autograd_meta->GetMutableGradNode()) { + VLOG(3) << "Tensor(" << name + << ") doesn't have GradNode, add GradNodeAccumulation to it."; + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); + } } return ToPyObject(tensor); EAGER_CATCH_AND_THROW_RETURN_NULL @@ -555,39 +574,42 @@ static PyObject* eager_api_sparse_csr_tensor(PyObject* self, auto non_zero_elements = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 2), 2); auto dense_shape = CastPyArg2VectorOfInt(PyTuple_GET_ITEM(args, 3), 3); auto 
stop_gradient = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4); - PADDLE_ENFORCE(non_zero_crows.is_dense_tensor(), - paddle::platform::errors::Fatal( - "the compressed non-zero rows must be a DenseTensor.")); - PADDLE_ENFORCE(non_zero_cols.is_dense_tensor(), - paddle::platform::errors::Fatal( - "the non-zero cols must be a DenseTensor.")); - PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(), - paddle::platform::errors::Fatal( - "the non-zero elements must be a DenseTensor.")); - - auto dense_crows = - std::dynamic_pointer_cast(non_zero_crows.impl()); - auto dense_cols = - std::dynamic_pointer_cast(non_zero_cols.impl()); - auto dense_elements = - std::dynamic_pointer_cast(non_zero_elements.impl()); - std::shared_ptr csr_tensor = - std::make_shared(*dense_crows, - *dense_cols, - *dense_elements, - phi::make_ddim(dense_shape)); paddle::experimental::Tensor tensor; - tensor.set_impl(csr_tensor); - auto name = - egr::Controller::Instance().GenerateUniqueName("generated_tensor"); - tensor.set_name(name); - auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); - autograd_meta->SetStopGradient(static_cast(stop_gradient)); - if (!autograd_meta->GetMutableGradNode()) { - VLOG(3) << "Tensor(" << name - << ") have not GradNode, add GradNodeAccumulation for it."; - autograd_meta->SetGradNode( - std::make_shared(autograd_meta)); + { + eager_gil_scoped_release guard; + PADDLE_ENFORCE(non_zero_crows.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the compressed non-zero rows must be a DenseTensor.")); + PADDLE_ENFORCE(non_zero_cols.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero cols must be a DenseTensor.")); + PADDLE_ENFORCE(non_zero_elements.is_dense_tensor(), + paddle::platform::errors::Fatal( + "the non-zero elements must be a DenseTensor.")); + + auto dense_crows = + std::dynamic_pointer_cast(non_zero_crows.impl()); + auto dense_cols = + std::dynamic_pointer_cast(non_zero_cols.impl()); + auto dense_elements = + std::dynamic_pointer_cast(non_zero_elements.impl()); + std::shared_ptr csr_tensor = + std::make_shared(*dense_crows, + *dense_cols, + *dense_elements, + phi::make_ddim(dense_shape)); + tensor.set_impl(csr_tensor); + auto name = + egr::Controller::Instance().GenerateUniqueName("generated_tensor"); + tensor.set_name(name); + auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor); + autograd_meta->SetStopGradient(static_cast(stop_gradient)); + if (!autograd_meta->GetMutableGradNode()) { + VLOG(3) << "Tensor(" << name + << ") have not GradNode, add GradNodeAccumulation for it."; + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); + } } return ToPyObject(tensor); EAGER_CATCH_AND_THROW_RETURN_NULL @@ -626,87 +648,215 @@ static PyObject* eager_api_async_read(PyObject* self, auto& buffer = GetTensorFromArgs("async_read", "buffer", args, 3, false); auto& offset = GetTensorFromArgs("async_read", "offset", args, 4, false); auto& count = GetTensorFromArgs("async_read", "count", args, 5, false); - PADDLE_ENFORCE_EQ( - src.is_gpu_pinned(), - true, - platform::errors::InvalidArgument("Required `src` device should be " - "CUDAPinnedPlace, but received %d.", - src.place())); - PADDLE_ENFORCE_EQ( - dst.is_gpu(), - true, - platform::errors::InvalidArgument( - "Required `dst` device should be CUDAPlace, but received %d.", - dst.place())); - PADDLE_ENFORCE_EQ( - index.is_cpu(), - true, - platform::errors::InvalidArgument( - "Required `index` device should be CPUPlace, but received %d.", - index.place())); - PADDLE_ENFORCE_EQ(buffer.is_gpu_pinned(), 
- true, - platform::errors::InvalidArgument( - "Required `buffer` device should be CUDAPinnedPlace, " - "but received %d.", - buffer.place())); - PADDLE_ENFORCE_EQ( - offset.is_cpu(), - true, - platform::errors::InvalidArgument( - "Required `offset` device should be CPUPlace, but received %d.", - offset.place())); - PADDLE_ENFORCE_EQ( - count.is_cpu(), - true, - platform::errors::InvalidArgument( - "Required `count` device should be CPUPlace, but received %d.", - count.place())); + { + eager_gil_scoped_release guard; + PADDLE_ENFORCE_EQ( + src.is_gpu_pinned(), + true, + platform::errors::InvalidArgument("Required `src` device should be " + "CUDAPinnedPlace, but received %d.", + src.place())); + PADDLE_ENFORCE_EQ( + dst.is_gpu(), + true, + platform::errors::InvalidArgument( + "Required `dst` device should be CUDAPlace, but received %d.", + dst.place())); + PADDLE_ENFORCE_EQ( + index.is_cpu(), + true, + platform::errors::InvalidArgument( + "Required `index` device should be CPUPlace, but received %d.", + index.place())); + PADDLE_ENFORCE_EQ(buffer.is_gpu_pinned(), + true, + platform::errors::InvalidArgument( + "Required `buffer` device should be CUDAPinnedPlace, " + "but received %d.", + buffer.place())); + PADDLE_ENFORCE_EQ( + offset.is_cpu(), + true, + platform::errors::InvalidArgument( + "Required `offset` device should be CPUPlace, but received %d.", + offset.place())); + PADDLE_ENFORCE_EQ( + count.is_cpu(), + true, + platform::errors::InvalidArgument( + "Required `count` device should be CPUPlace, but received %d.", + count.place())); - auto& src_tensor = src; - auto* dst_tensor = &dst; - auto& index_tensor = index; - auto* buffer_tensor = &buffer; - auto& offset_tensor = offset; - auto& count_tensor = count; - auto* dst_data = dst_tensor->mutable_data(dst.place()); - const auto& deviceId = paddle::platform::GetCurrentDeviceId(); + auto& src_tensor = src; + auto* dst_tensor = &dst; + auto& index_tensor = index; + auto* buffer_tensor = &buffer; + auto& offset_tensor = offset; + auto& count_tensor = count; + auto* dst_data = dst_tensor->mutable_data(dst.place()); + const auto& deviceId = paddle::platform::GetCurrentDeviceId(); - PADDLE_ENFORCE_EQ(src_tensor.dims().size(), - dst_tensor->dims().size(), - platform::errors::InvalidArgument( - "`src` and `dst` should have same tensor shape, " - "except for the first dimension.")); - PADDLE_ENFORCE_EQ(src_tensor.dims().size(), - buffer_tensor->dims().size(), - platform::errors::InvalidArgument( - "`src` and `buffer` should have same tensor shape, " - "except for the first dimension.")); - for (int i = 1; i < src_tensor.dims().size(); i++) { - PADDLE_ENFORCE_EQ(src_tensor.dims()[i], - dst_tensor->dims()[i], + PADDLE_ENFORCE_EQ(src_tensor.dims().size(), + dst_tensor->dims().size(), platform::errors::InvalidArgument( - "`src` and `dst` should have the same tensor shape, " + "`src` and `dst` should have same tensor shape, " "except for the first dimension.")); - PADDLE_ENFORCE_EQ( - src_tensor.dims()[i], - buffer_tensor->dims()[i], - platform::errors::InvalidArgument( - "`src` and `buffer` should have the same tensor shape, " - "except for the first dimension.")); + PADDLE_ENFORCE_EQ(src_tensor.dims().size(), + buffer_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `buffer` should have same tensor shape, " + "except for the first dimension.")); + for (int i = 1; i < src_tensor.dims().size(); i++) { + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], + dst_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `dst` 
should have the same tensor shape, " + "except for the first dimension.")); + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], + buffer_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `buffer` should have the same tensor shape, " + "except for the first dimension.")); + } + PADDLE_ENFORCE_EQ(index_tensor.dims().size(), + 1, + platform::errors::InvalidArgument( + "`index` tensor should be one-dimensional.")); + + auto stream = paddle::platform::get_current_stream(deviceId)->raw_stream(); + + int64_t numel = 0; // total copy length + int64_t copy_flag = offset_tensor.dims()[0]; + int64_t size = src_tensor.numel() / src_tensor.dims()[0]; + + if (copy_flag != 0) { + PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), + 1, + platform::errors::InvalidArgument( + "`offset` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(count_tensor.dims().size(), + 1, + platform::errors::InvalidArgument( + "`count` tensor should be one-dimensional.")); + PADDLE_ENFORCE_EQ(offset_tensor.numel(), + count_tensor.numel(), + platform::errors::InvalidArgument( + "`offset` and `count` tensor size dismatch.")); + auto* offset_data = offset_tensor.data(); + auto* count_data = count_tensor.data(); + for (int64_t i = 0; i < count_tensor.numel(); i++) { + numel += count_data[i]; + } + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), + buffer_tensor->dims()[0], + platform::errors::InvalidArgument( + "Buffer tensor size is too small.")); + PADDLE_ENFORCE_LE(numel + index_tensor.numel(), + dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Target tensor size is too small.")); + + int64_t src_offset, dst_offset = 0, c; + auto* src_data = src_tensor.data(); + for (int64_t i = 0; i < offset_tensor.numel(); i++) { + src_offset = offset_data[i], c = count_data[i]; + PADDLE_ENFORCE_LE(src_offset + c, + src_tensor.dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index.")); + PADDLE_ENFORCE_LE(dst_offset + c, + dst_tensor->dims()[0], + platform::errors::InvalidArgument( + "Invalid offset or count index.")); + cudaMemcpyAsync(dst_data + (dst_offset * size), + src_data + (src_offset * size), + c * size * sizeof(float), + cudaMemcpyHostToDevice, + stream); + dst_offset += c; + } + } else { + PADDLE_ENFORCE_LE(index_tensor.numel(), + buffer_tensor->dims()[0], + platform::errors::InvalidArgument( + "Buffer tensor size is too small.")); + } + + // Select the index data to the buffer + auto index_select = [](const paddle::experimental::Tensor& src_tensor, + const paddle::experimental::Tensor& index_tensor, + paddle::experimental::Tensor* buffer_tensor) { + auto* src_data = src_tensor.data(); + auto* index_data = index_tensor.data(); + auto* buffer_data = buffer_tensor->data(); + const int& slice_size = src_tensor.numel() / src_tensor.dims()[0]; + const int& copy_bytes = slice_size * sizeof(float); + int64_t c = 0; + for (int64_t i = 0; i < index_tensor.numel(); i++) { + std::memcpy(buffer_data + c * slice_size, + src_data + index_data[i] * slice_size, + copy_bytes); + c += 1; + } + }; + index_select(src_tensor, index_tensor, buffer_tensor); + + // Copy the data to device memory + cudaMemcpyAsync(dst_data + (numel * size), + buffer_tensor->data(), + index_tensor.numel() * size * sizeof(float), + cudaMemcpyHostToDevice, + stream); } - PADDLE_ENFORCE_EQ(index_tensor.dims().size(), - 1, - platform::errors::InvalidArgument( - "`index` tensor should be one-dimensional.")); + RETURN_PY_NONE + EAGER_CATCH_AND_THROW_RETURN_NULL +} - auto stream = 
paddle::platform::get_current_stream(deviceId)->raw_stream(); +static PyObject* eager_api_async_write(PyObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto& src = GetTensorFromArgs("async_write", "src", args, 0, false); + auto& dst = GetTensorFromArgs("async_write", "dst", args, 1, false); + auto& offset = GetTensorFromArgs("async_write", "offset", args, 2, false); + auto& count = GetTensorFromArgs("async_write", "count", args, 3, false); + { + eager_gil_scoped_release guard; + PADDLE_ENFORCE_EQ( + src.is_gpu(), + true, + platform::errors::InvalidArgument( + "Required `src` device should be CUDAPlace, but received %d. ", + src.place())); + PADDLE_ENFORCE_EQ(dst.is_gpu_pinned(), + true, + platform::errors::InvalidArgument( + "Required `dst` device should be CUDAPinnedPlace, " + "but received %d. ", + dst.place())); + PADDLE_ENFORCE_EQ( + offset.is_cpu(), + true, + platform::errors::InvalidArgument("Required `offset` device should " + "be CPUPlace, but received %d. ", + offset.place())); + PADDLE_ENFORCE_EQ( + count.is_cpu(), + true, + platform::errors::InvalidArgument( + "Required `count` device should be CPUPlace, but received %d. ", + count.place())); - int64_t numel = 0; // total copy length - int64_t copy_flag = offset_tensor.dims()[0]; - int64_t size = src_tensor.numel() / src_tensor.dims()[0]; + // TODO(daisiming): In future, add index as arguments following + // async_read. + auto& src_tensor = src; + auto* dst_tensor = &dst; + auto& offset_tensor = offset; + auto& count_tensor = count; + const auto& deviceId = paddle::platform::GetCurrentDeviceId(); - if (copy_flag != 0) { PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), 1, platform::errors::InvalidArgument( @@ -719,165 +869,45 @@ static PyObject* eager_api_async_read(PyObject* self, count_tensor.numel(), platform::errors::InvalidArgument( "`offset` and `count` tensor size dismatch.")); - auto* offset_data = offset_tensor.data(); - auto* count_data = count_tensor.data(); - for (int64_t i = 0; i < count_tensor.numel(); i++) { - numel += count_data[i]; + PADDLE_ENFORCE_EQ(src_tensor.dims().size(), + dst_tensor->dims().size(), + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); + for (int i = 1; i < src_tensor.dims().size(); i++) { + PADDLE_ENFORCE_EQ( + src_tensor.dims()[i], + dst_tensor->dims()[i], + platform::errors::InvalidArgument( + "`src` and `dst` should have the same tensor shape, " + "except for the first dimension.")); } - PADDLE_ENFORCE_LE( - numel + index_tensor.numel(), - buffer_tensor->dims()[0], - platform::errors::InvalidArgument("Buffer tensor size is too small.")); - PADDLE_ENFORCE_LE( - numel + index_tensor.numel(), - dst_tensor->dims()[0], - platform::errors::InvalidArgument("Target tensor size is too small.")); - int64_t src_offset, dst_offset = 0, c; + auto stream = paddle::platform::get_current_stream(deviceId)->raw_stream(); + + int64_t size = src_tensor.numel() / src_tensor.dims()[0]; auto* src_data = src_tensor.data(); + auto* dst_data = dst_tensor->data(); + const int64_t* offset_data = offset_tensor.data(); + const int64_t* count_data = count_tensor.data(); + int64_t src_offset = 0, dst_offset, c; for (int64_t i = 0; i < offset_tensor.numel(); i++) { - src_offset = offset_data[i], c = count_data[i]; + dst_offset = offset_data[i], c = count_data[i]; PADDLE_ENFORCE_LE( src_offset + c, src_tensor.dims()[0], - platform::errors::InvalidArgument("Invalid offset or count index.")); + 
platform::errors::InvalidArgument("Invalid offset or count index")); PADDLE_ENFORCE_LE( dst_offset + c, dst_tensor->dims()[0], - platform::errors::InvalidArgument("Invalid offset or count index.")); + platform::errors::InvalidArgument("Invalid offset or count index")); cudaMemcpyAsync(dst_data + (dst_offset * size), src_data + (src_offset * size), c * size * sizeof(float), - cudaMemcpyHostToDevice, + cudaMemcpyDeviceToHost, stream); - dst_offset += c; - } - } else { - PADDLE_ENFORCE_LE( - index_tensor.numel(), - buffer_tensor->dims()[0], - platform::errors::InvalidArgument("Buffer tensor size is too small.")); - } - - // Select the index data to the buffer - auto index_select = [](const paddle::experimental::Tensor& src_tensor, - const paddle::experimental::Tensor& index_tensor, - paddle::experimental::Tensor* buffer_tensor) { - auto* src_data = src_tensor.data(); - auto* index_data = index_tensor.data(); - auto* buffer_data = buffer_tensor->data(); - const int& slice_size = src_tensor.numel() / src_tensor.dims()[0]; - const int& copy_bytes = slice_size * sizeof(float); - int64_t c = 0; - for (int64_t i = 0; i < index_tensor.numel(); i++) { - std::memcpy(buffer_data + c * slice_size, - src_data + index_data[i] * slice_size, - copy_bytes); - c += 1; + src_offset += c; } - }; - index_select(src_tensor, index_tensor, buffer_tensor); - - // Copy the data to device memory - cudaMemcpyAsync(dst_data + (numel * size), - buffer_tensor->data(), - index_tensor.numel() * size * sizeof(float), - cudaMemcpyHostToDevice, - stream); - RETURN_PY_NONE - EAGER_CATCH_AND_THROW_RETURN_NULL -} - -static PyObject* eager_api_async_write(PyObject* self, - PyObject* args, - PyObject* kwargs) { - EAGER_TRY - auto& src = GetTensorFromArgs("async_write", "src", args, 0, false); - auto& dst = GetTensorFromArgs("async_write", "dst", args, 1, false); - auto& offset = GetTensorFromArgs("async_write", "offset", args, 2, false); - auto& count = GetTensorFromArgs("async_write", "count", args, 3, false); - PADDLE_ENFORCE_EQ( - src.is_gpu(), - true, - platform::errors::InvalidArgument( - "Required `src` device should be CUDAPlace, but received %d. ", - src.place())); - PADDLE_ENFORCE_EQ(dst.is_gpu_pinned(), - true, - platform::errors::InvalidArgument( - "Required `dst` device should be CUDAPinnedPlace, " - "but received %d. ", - dst.place())); - PADDLE_ENFORCE_EQ( - offset.is_cpu(), - true, - platform::errors::InvalidArgument("Required `offset` device should " - "be CPUPlace, but received %d. ", - offset.place())); - PADDLE_ENFORCE_EQ( - count.is_cpu(), - true, - platform::errors::InvalidArgument( - "Required `count` device should be CPUPlace, but received %d. ", - count.place())); - - // TODO(daisiming): In future, add index as arguments following - // async_read. 
- auto& src_tensor = src; - auto* dst_tensor = &dst; - auto& offset_tensor = offset; - auto& count_tensor = count; - const auto& deviceId = paddle::platform::GetCurrentDeviceId(); - - PADDLE_ENFORCE_EQ(offset_tensor.dims().size(), - 1, - platform::errors::InvalidArgument( - "`offset` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(count_tensor.dims().size(), - 1, - platform::errors::InvalidArgument( - "`count` tensor should be one-dimensional.")); - PADDLE_ENFORCE_EQ(offset_tensor.numel(), - count_tensor.numel(), - platform::errors::InvalidArgument( - "`offset` and `count` tensor size dismatch.")); - PADDLE_ENFORCE_EQ(src_tensor.dims().size(), - dst_tensor->dims().size(), - platform::errors::InvalidArgument( - "`src` and `dst` should have the same tensor shape, " - "except for the first dimension.")); - for (int i = 1; i < src_tensor.dims().size(); i++) { - PADDLE_ENFORCE_EQ(src_tensor.dims()[i], - dst_tensor->dims()[i], - platform::errors::InvalidArgument( - "`src` and `dst` should have the same tensor shape, " - "except for the first dimension.")); - } - - auto stream = paddle::platform::get_current_stream(deviceId)->raw_stream(); - - int64_t size = src_tensor.numel() / src_tensor.dims()[0]; - auto* src_data = src_tensor.data(); - auto* dst_data = dst_tensor->data(); - const int64_t* offset_data = offset_tensor.data(); - const int64_t* count_data = count_tensor.data(); - int64_t src_offset = 0, dst_offset, c; - for (int64_t i = 0; i < offset_tensor.numel(); i++) { - dst_offset = offset_data[i], c = count_data[i]; - PADDLE_ENFORCE_LE( - src_offset + c, - src_tensor.dims()[0], - platform::errors::InvalidArgument("Invalid offset or count index")); - PADDLE_ENFORCE_LE( - dst_offset + c, - dst_tensor->dims()[0], - platform::errors::InvalidArgument("Invalid offset or count index")); - cudaMemcpyAsync(dst_data + (dst_offset * size), - src_data + (src_offset * size), - c * size * sizeof(float), - cudaMemcpyDeviceToHost, - stream); - src_offset += c; } RETURN_PY_NONE EAGER_CATCH_AND_THROW_RETURN_NULL @@ -929,7 +959,6 @@ static PyObject* eager_api_to_uva_tensor(PyObject* self, "float64, int8, int16, int32, int64," "please check your input or input array data type.")); } - return ToPyObject(*(new_tensor.get())); EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 3521a9d5399..1af905fc1e6 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -156,6 +156,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, } if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) { + eager_gil_scoped_release guard; platform::CPUPlace place; if (self->tensor.is_selected_rows()) { VLOG(6) << "Getting SelectedRows's numpy value"; @@ -186,6 +187,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (self->tensor.is_gpu()) { + eager_gil_scoped_release guard; #if defined(PADDLE_WITH_CUDA) gpuMemcpyKind kind = cudaMemcpyDeviceToHost; #elif defined(PADDLE_WITH_HIP) @@ -244,6 +246,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE } else if (self->tensor.is_custom_device()) { + eager_gil_scoped_release guard; if (self->tensor.is_selected_rows()) { VLOG(6) << "Getting SelectedRows's numpy value"; auto* selected_rows = @@ -311,8 +314,8 @@ static PyObject* tensor_method_numpy_for_string_tensor(TensorObject* self, const auto* st_ptr = string_tensor->data(); auto numel 
= self->tensor.numel(); auto tensor_dims = self->tensor.shape(); - // Get the max unicode length of StringTensor to create numpy unicode string - // array. + // Get the max unicode length of StringTensor to create numpy unicode + // string array. auto* longest_pstring = std::max_element( st_ptr, st_ptr + numel, [](const auto& a, const auto& b) { auto a_unicode_len = @@ -394,14 +397,18 @@ static PyObject* tensor_method__copy_to(TensorObject* self, EAGER_TRY auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); - auto cp_tensor = self->tensor.copy_to(place, blocking); - if (!blocking) { - IncreaseTensorReferenceCountUntilCopyComplete(self->tensor, place); + paddle::experimental::Tensor cp_tensor; + { + eager_gil_scoped_release guard; + cp_tensor = self->tensor.copy_to(place, blocking); + if (!blocking) { + IncreaseTensorReferenceCountUntilCopyComplete(self->tensor, place); + } + egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); + egr::EagerUtils::autograd_meta(&cp_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); } - egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); - egr::EagerUtils::autograd_meta(&cp_tensor) - ->SetPersistable( - egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); return ToPyObject(cp_tensor); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -410,11 +417,15 @@ static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - auto cp_tensor = self->tensor.copy_to(phi::CPUPlace(), true); - egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); - egr::EagerUtils::autograd_meta(&cp_tensor) - ->SetPersistable( - egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + paddle::experimental::Tensor cp_tensor; + { + eager_gil_scoped_release guard; + cp_tensor = self->tensor.copy_to(phi::CPUPlace(), true); + egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); + egr::EagerUtils::autograd_meta(&cp_tensor) + ->SetPersistable( + egr::EagerUtils::autograd_meta(&(self->tensor))->Persistable()); + } return ToPyObject(cp_tensor); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -450,6 +461,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, VLOG(6) << "Start Copy Tensor " << src_tensor.name() << " to " << self->tensor.name(); if (!self->tensor.initialized()) { + eager_gil_scoped_release guard; egr::EagerUtils::autograd_meta(&(self->tensor)) ->SetStopGradient( egr::EagerUtils::autograd_meta(&(src_tensor))->StopGradient()); @@ -461,6 +473,7 @@ static PyObject* tensor_method_copy_(TensorObject* self, } } else { if (src_tensor.initialized()) { + eager_gil_scoped_release guard; self->tensor.copy_(src_tensor, self->tensor.place(), blocking); } } @@ -476,16 +489,19 @@ static PyObject* tensor_method_clone(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY + paddle::experimental::Tensor out; + { + eager_gil_scoped_release guard; + PADDLE_ENFORCE_EQ( + self->tensor.initialized(), + true, + paddle::platform::errors::InvalidArgument( + "We can only support initialized tensor in clone, however we got " + "uninitialized tensor %s, please check your code.", + self->tensor.name())); - PADDLE_ENFORCE_EQ( - self->tensor.initialized(), - true, - paddle::platform::errors::InvalidArgument( - "We can only support initialized tensor in clone, however we got " - "uninitialized tensor %s, please check your code.", - self->tensor.name())); - - auto out = 
assign_ad_func(self->tensor); + out = assign_ad_func(self->tensor); + } return ToPyObject(out); EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -495,6 +511,7 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* kwargs) { EAGER_TRY if (egr::Controller::Instance().HasGrad()) { + eager_gil_scoped_release guard; auto meta = egr::EagerUtils::autograd_meta(&(self->tensor)); if (!meta->GetMutableGradNode()) { VLOG(6) << "Make grad node of tensor: " << self->tensor.name() @@ -535,6 +552,7 @@ static PyObject* tensor_clear_gradient(TensorObject* self, } if (grad->impl()) { + eager_gil_scoped_release guard; if (grad->is_selected_rows()) { auto selected_rows = std::dynamic_pointer_cast(grad->impl()); @@ -577,6 +595,7 @@ static PyObject* tensor__zero_grads(TensorObject* self, VLOG(4) << "ZeroGrads " << self->tensor.name(); if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { + eager_gil_scoped_release guard; // Add RetainGrad as PostHook to AccumulationNode paddle::experimental::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor); @@ -595,6 +614,7 @@ static PyObject* tensor__zero_grads(TensorObject* self, } } } else { + eager_gil_scoped_release guard; auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); if (meta->MutableGrad()->initialized()) { if (meta->MutableGrad()->is_dense_tensor()) { @@ -855,6 +875,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, decrease_axis.end()); if (op_type == "slice") { + eager_gil_scoped_release guard; out = slice_ad_func(self->tensor, slice_axes_tmp, slice_starts, @@ -862,6 +883,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, infer_flags_tmp, decrease_axis_tmp); } else if (op_type == "strided_slice") { + eager_gil_scoped_release guard; out = strided_slice_ad_func( self->tensor, slice_axes, slice_starts, slice_ends, slice_strides); } else { @@ -886,28 +908,31 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, none_axes.pop_back(); } if (!none_axes.empty()) { - // Deal with cases that decrease_axes is not empty - // For example: - // # x.shape: (2,3,4) - // out = x[0, 0:2, None] # out.shape : (2, 1, 4) - for (auto& axis : none_axes) { - int len = 0; - for (int da : decrease_axis) { - if (da < axis) { - len++; + paddle::experimental::Tensor new_out; + { + eager_gil_scoped_release guard; + // Deal with cases that decrease_axes is not empty + // For example: + // # x.shape: (2,3,4) + // out = x[0, 0:2, None] # out.shape : (2, 1, 4) + for (auto& axis : none_axes) { + int len = 0; + for (int da : decrease_axis) { + if (da < axis) { + len++; + } } + axis -= len; } - axis -= len; + new_out = unsqueeze_ad_func(out, none_axes); } - - paddle::experimental::Tensor new_out; - new_out = unsqueeze_ad_func(out, none_axes); return ToPyObject(new_out); } } // the index is a list if (list_select_flag) { + eager_gil_scoped_release guard; auto select_index = paddle::experimental::Tensor( egr::Controller::Instance().GenerateUniqueName()); auto idx_tensor = std::make_shared(); -- GitLab
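
Note on the pattern this patch applies throughout: each pybind entry point keeps argument parsing (PyTuple_GET_ITEM, CastPyArg2*) under the GIL, then wraps the pure C++ work in a scope guarded by eager_gil_scoped_release so other Python threads can run; conversely, GradNodePyLayer's destructor now acquires the GIL before Py_XDECREF(ctx_), because grad nodes may be destroyed from threads that do not hold it. The standalone sketch below illustrates the same two rules with pybind11's stock RAII guards. It is an illustration under stated assumptions, not Paddle code: the module name gil_pattern_demo, scale_all, and PyObjectHolder are hypothetical, and Paddle's own eager_gil_scoped_release guard plays the role of py::gil_scoped_release here.

// Minimal sketch, not Paddle code: release the GIL around pure C++ work,
// re-acquire it before touching Python objects or reference counts.
#include <Python.h>

#include <vector>

#include "pybind11/pybind11.h"
#include "pybind11/stl.h"

namespace py = pybind11;

// Called from Python with the GIL held; argument conversion already happened.
std::vector<double> scale_all(std::vector<double> values, double factor) {
  {
    py::gil_scoped_release release;  // let other Python threads run
    for (auto& v : values) {         // pure C++ section: no CPython API calls
      v *= factor;
    }
  }  // GIL re-acquired here, before the result is converted back to Python
  return values;
}

// Owns a reference to a Python object but may be destroyed on a C++ thread
// that does not hold the GIL, the same situation as
// GradNodePyLayer::~GradNodePyLayer(), which acquires the GIL before
// calling Py_XDECREF(ctx_).
class PyObjectHolder {
 public:
  // Constructed from Python-facing code, so the GIL is held here.
  explicit PyObjectHolder(PyObject* obj) : obj_(obj) { Py_XINCREF(obj_); }
  ~PyObjectHolder() {
    py::gil_scoped_acquire gil;  // refcount changes require the GIL
    Py_XDECREF(obj_);
  }
  PyObjectHolder(const PyObjectHolder&) = delete;
  PyObjectHolder& operator=(const PyObjectHolder&) = delete;

 private:
  PyObject* obj_ = nullptr;
};

PYBIND11_MODULE(gil_pattern_demo, m) {
  m.def("scale_all",
        &scale_all,
        "Scale a list of floats with the GIL released during the loop");
}

The key design point is the split itself: nothing inside the released scope may call back into the CPython API, and anything that touches Python objects afterwards must hold the GIL again, which the RAII guards enforce even when exceptions propagate.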