Unverified commit 32c97a9d, authored by Aurelius84, committed by GitHub

[Dy2Sta] Fix segment fault when training on multiple cards if params have no grad (#44485)

* [Dy2Sta] Fix segment fault when training on multiple cards if params have no grad

* fix unittest
Parent 85c6937b
@@ -21,6 +21,23 @@
#include "paddle/fluid/eager/to_static/run_program_op_node.h"
#include "paddle/fluid/eager/utils.h"

// Filter params without grads in the global block. In this case, we will
// tag their AutogradMeta with stop_gradient = True to avoid a fault from the
// reducer while training on multiple cards.
static void clear_no_grad_edges(
    const std::vector<paddle::experimental::Tensor>& params,
    const paddle::framework::BlockDesc* block_desc,
    egr::GradNodeBase* grad_node,
    size_t slot_id) {
  for (size_t i = 0; i < params.size(); ++i) {
    auto p_grad_name = paddle::framework::GradVarName(params[i].name());
    if (!block_desc->HasVar(p_grad_name)) {
      VLOG(1) << "clear edge of " << p_grad_name;
      grad_node->MutableOutputMeta()[slot_id][i].GetMutableEdge().Clear();
    }
  }
}
inline void run_program_dygraph_function(
    const std::vector<paddle::experimental::Tensor>& x,
    const std::vector<paddle::experimental::Tensor>& params,
@@ -61,12 +78,16 @@ inline void run_program_dygraph_function(
  grad_node->SetAttrMap(attrs);
  // Set TensorWrappers
  grad_node->SetFwdX(x);
  grad_node->SetFwdParams(params);
  grad_node->SetStepScope(step_scope);

  // Set Grad out rank as same as fwd input and set stop gradient to bwd
  grad_node->SetGradOutMeta(x, /*slot id*/ 0);
  grad_node->SetGradOutMeta(params, /*slot id*/ 1);

  auto* global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc*,
                                        attrs.at("global_block"));
  clear_no_grad_edges(params, global_block, grad_node.get(), /*slot id*/ 1);

  grad_node->SetGradInMeta(deref_out, 0);
......
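An aside on the check above, not part of the commit: Paddle derives a parameter's gradient-variable name by appending the "@GRAD" suffix, which is what framework::GradVarName computes and block_desc->HasVar looks up. A parameter whose forward output is marked stop_gradient=True never gets such a variable in the traced global block, so its grad edge is cleared. A minimal Python sketch of the convention, using a hypothetical parameter name:

# Illustration only; "emb1.weight" is a hypothetical parameter name and the
# "@GRAD" suffix is assumed to mirror framework::GradVarName on the C++ side.
param_name = "emb1.weight"
grad_var_name = param_name + "@GRAD"   # -> "emb1.weight@GRAD"
# If the traced global block contains no variable with this name, the
# parameter produced no gradient and clear_no_grad_edges drops its edge.
print(grad_var_name)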
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import paddle.nn as nn
import paddle.distributed as dist
import unittest
class Net(nn.Layer):
    def __init__(self):
        super(Net, self).__init__()
        self.emb1 = nn.Embedding(100, 16)
        self.emb2 = nn.Embedding(100, 16)

    def forward(self, ids):
        feat1 = self.emb1(ids)
        feat1.stop_gradient = True  # here
        feat2 = self.emb2(ids)
        out = feat1 + feat2
        out = paddle.mean(out)
        return out
def train():
    paddle.distributed.init_parallel_env()
    net = Net()
    net = paddle.jit.to_static(net)
    sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=net.parameters())
    dp_net = paddle.DataParallel(net)

    for i in range(4):
        x = paddle.randint(low=0, high=100, shape=[4, 10])
        loss = dp_net(x)
        loss.backward()
        sgd.step()
        loss.clear_gradient()
        print(loss)
class TestParamsNoGrad(unittest.TestCase):
    def test_two_card(self):
        if paddle.is_compiled_with_cuda() and len(
                paddle.static.cuda_places()) > 1:
            dist.spawn(train, nprocs=2, gpus='0,1')


if __name__ == '__main__':
    unittest.main()
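A side note, not part of the commit: the "no grad" condition the fix targets is visible even in a single process. A minimal sketch, assuming the Net class defined in the test above is in scope; it relies on paddle.nn.Embedding exposing its parameter as .weight:

import paddle

# feat1 is detached via stop_gradient=True, so no gradient ever reaches
# emb1.weight, while emb2.weight does receive one. Under DataParallel,
# parameters like emb1.weight are the "no grad" case this fix handles.
net = Net()  # Net as defined in the test above
x = paddle.randint(low=0, high=100, shape=[4, 10])
loss = net(x)
loss.backward()
print(net.emb1.weight.grad)           # expected: None
print(net.emb2.weight.grad is None)   # expected: False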