From 32c97a9d9090395ba0ec110d532e6b06215dc12f Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Thu, 21 Jul 2022 15:40:08 +0800
Subject: [PATCH] [Dy2Sta]Fix Segment Fault while training multi-card if params have no grad (#44485)

* [Dy2Sta]Fix Segment Fault while training multi-card if params have no grad

* fix unittest
---
 .../eager/to_static/run_program_op_func.h     | 21 ++++++
 .../dygraph_to_static/test_params_no_grad.py  | 65 +++++++++++++++++++
 2 files changed, 86 insertions(+)
 create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_params_no_grad.py

diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h
index 6b0a848350..fb8d64e377 100644
--- a/paddle/fluid/eager/to_static/run_program_op_func.h
+++ b/paddle/fluid/eager/to_static/run_program_op_func.h
@@ -21,6 +21,23 @@
 #include "paddle/fluid/eager/to_static/run_program_op_node.h"
 #include "paddle/fluid/eager/utils.h"
 
+// Filter params without grads in global block. In this case, we will
+// tag its AutogradMeta with stop_gradient = True to avoid fault from
+// reducer while training on multi-cards.
+static void clear_no_grad_edges(
+    const std::vector<paddle::experimental::Tensor>& params,
+    const paddle::framework::BlockDesc* block_desc,
+    egr::GradNodeBase* grad_node,
+    size_t slot_id) {
+  for (size_t i = 0; i < params.size(); ++i) {
+    auto p_grad_name = paddle::framework::GradVarName(params[i].name());
+    if (!block_desc->HasVar(p_grad_name)) {
+      VLOG(1) << "clear edge of " << p_grad_name;
+      grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear();
+    }
+  }
+}
+
 inline void run_program_dygraph_function(
     const std::vector<paddle::experimental::Tensor>& x,
     const std::vector<paddle::experimental::Tensor>& params,
@@ -61,12 +78,16 @@ inline void run_program_dygraph_function(
     grad_node->SetAttrMap(attrs);
     // Set TensorWrappers
     grad_node->SetFwdX(x);
+    grad_node->SetFwdParams(params);
     grad_node->SetStepScope(step_scope);
 
     // Set Grad out rank as same as fwd input and set stop gradient to bwd
     grad_node->SetGradOutMeta(x, /*slot id*/ 0);
     grad_node->SetGradOutMeta(params, /*slot id*/ 1);
+    auto* global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc*,
+                                          attrs.at("global_block"));
+    clear_no_grad_edges(params, global_block, grad_node.get(), /*slot id*/ 1);
 
     grad_node->SetGradInMeta(deref_out, 0);
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_params_no_grad.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_params_no_grad.py
new file mode 100644
index 0000000000..f44faa642a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_params_no_grad.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import paddle.nn as nn
+import paddle.distributed as dist
+
+import unittest
+
+
+class Net(nn.Layer):
+
+    def __init__(self):
+        super(Net, self).__init__()
+        self.emb1 = nn.Embedding(100, 16)
+        self.emb2 = nn.Embedding(100, 16)
+
+    def forward(self, ids):
+        feat1 = self.emb1(ids)
+        feat1.stop_gradient = True  # here
+
+        feat2 = self.emb2(ids)
+
+        out = feat1 + feat2
+        out = paddle.mean(out)
+        return out
+
+
+def train():
+    paddle.distributed.init_parallel_env()
+    net = Net()
+    net = paddle.jit.to_static(net)
+
+    sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=net.parameters())
+    dp_net = paddle.DataParallel(net)
+    for i in range(4):
+        x = paddle.randint(low=0, high=100, shape=[4, 10])
+        loss = dp_net(x)
+        loss.backward()
+        sgd.step()
+        loss.clear_gradient()
+        print(loss)
+
+
+class TestParamsNoGrad(unittest.TestCase):

+    def test_two_card(self):
+        if paddle.is_compiled_with_cuda() and len(
+                paddle.static.cuda_places()) > 1:
+            dist.spawn(train, nprocs=2, gpus='0,1')
+
+
+if __name__ == '__main__':
+    unittest.main()
--
GitLab
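
Note (not part of the patch): the situation being fixed is a parameter that receives no gradient at all, so the DataParallel reducer used to hit an invalid grad edge on multi-card runs. A minimal single-card sketch below simply mirrors the Net from the new unittest to show that emb1.weight really ends up with no gradient once stop_gradient is set on its output; it is an illustration only, not code from PR #44485.

import paddle
import paddle.nn as nn

# Same layout as the Net in test_params_no_grad.py.
emb1 = nn.Embedding(100, 16)
emb2 = nn.Embedding(100, 16)

ids = paddle.randint(low=0, high=100, shape=[4, 10])
feat1 = emb1(ids)
feat1.stop_gradient = True  # cuts the graph, so no grad flows back to emb1
feat2 = emb2(ids)

loss = paddle.mean(feat1 + feat2)
loss.backward()

print(emb1.weight.grad)              # None: emb1 received no gradient
print(emb2.weight.grad is not None)  # True: emb2 did receive a gradient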