Unverified · Commit 664f958a authored by Chen Weihang, committed by GitHub

Fix optimizer op infershape failed in dygraph multi-cards mode (#21374)

* add param & grad shape check for sgd op

* add _reshape_inplace interface for dygraph parallel

* refine unittests based on paddle/models scripts, test=develop

* add unittest for parallel grad fuse, test=develop
Parent 630be319
@@ -40,8 +40,15 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at compile
-    // and runtime.
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(
+          param_dim, ctx->GetInputDim("Grad"),
+          platform::errors::InvalidArgument(
+              "SGD Operator's input Param and Grad dimensions do not match. "
+              "The Param shape is [%s], but the Grad shape is [%s].",
+              param_dim, ctx->GetInputDim("Grad")));
+    }
     ctx->SetOutputDim("ParamOut", param_dim);
   }
......
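For orientation, here is a minimal sketch in Python of the rule the new check enforces: when Grad is a dense LoDTensor, its dimensions must match Param's before ParamOut's shape is set (selected-rows gradients are skipped). The helper name sgd_infershape_check is hypothetical and not a Paddle API; the real check is the PADDLE_ENFORCE_EQ in the C++ InferShape above.

# Hypothetical illustration only -- not Paddle code.
def sgd_infershape_check(param_shape, grad_shape, grad_is_lod_tensor=True):
    # Mirrors the added PADDLE_ENFORCE_EQ: a dense Grad must match Param's dims.
    if grad_is_lod_tensor and list(param_shape) != list(grad_shape):
        raise ValueError(
            "SGD Operator's input Param and Grad dimensions do not match. "
            "The Param shape is %s, but the Grad shape is %s." %
            (list(param_shape), list(grad_shape)))
    return list(param_shape)  # shape propagated to ParamOut


sgd_infershape_check([5, 10], [5, 10])   # passes: shapes match
# sgd_infershape_check([5, 10], [50])    # would raise, e.g. a grad left flattened by a fused buffer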
@@ -184,6 +184,15 @@ class DataParallel(layers.Layer):
                 [coalesced_grad, grad_vars, g_var_shapes])
         return coalesced_grads_and_grad_vars
 
+    def _reshape_inplace(self, x, shape):
+        x_shape = self._helper.create_variable_for_type_inference(dtype=x.dtype)
+        self._helper.append_op(
+            type="reshape2",
+            inputs={'X': x},
+            attrs={'shape': shape},
+            outputs={'Out': x,
+                     'XShape': x_shape})
+
     def _split_tensors(self, coalesced_grads_and_grad_vars):
         from ..layers import nn
         for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
@@ -195,7 +204,8 @@ class DataParallel(layers.Layer):
                 attrs={'sections': grad_var_len,
                        'axis': 0})
             for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
-                nn.reshape(x=g_var, shape=g_shape, inplace=True)
+                self._reshape_inplace(x=g_var, shape=g_shape)
+                assert g_var.shape == g_shape
 
     @no_grad
     def apply_collective_grads(self):
......
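Why _split_tensors needs to reshape at all: the gradients are flattened and concatenated into one coalesced buffer so a single collective can all-reduce them, and after splitting the buffer each slice must be restored to its parameter's original shape. A rough NumPy sketch of that round trip (conceptual only; the actual code uses the split and reshape2 ops shown in the diff above):

import numpy as np

# Conceptual sketch, not Paddle code: flatten + concat grads, then split and reshape back.
grads = [np.random.random(s).astype("float32") for s in ([2, 3], [4, 9], [10, 1])]
shapes = [g.shape for g in grads]
lengths = [g.size for g in grads]

coalesced = np.concatenate([g.reshape(-1) for g in grads])  # one flat buffer
# ... the all-reduce of `coalesced` across cards would happen here ...
pieces = np.split(coalesced, np.cumsum(lengths)[:-1])       # split by element count
restored = [p.reshape(s) for p, s in zip(pieces, shapes)]   # back to original shapes
assert all(r.shape == s for r, s in zip(restored, shapes))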
@@ -114,7 +114,7 @@ class TestMnist(TestParallelDyGraphRunnerBase):
         model = MNIST("mnist")
         train_reader = paddle.batch(
             paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
-        opt = fluid.optimizer.SGD(learning_rate=1e-3)
+        opt = fluid.optimizer.Adam(learning_rate=1e-3)
         return model, train_reader, opt
 
     def run_one_loop(self, model, opt, data):
......
@@ -33,9 +33,26 @@ from paddle.fluid.layer_helper import LayerHelper
 import math
 from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
 
+batch_size = 64
 momentum_rate = 0.9
 l2_decay = 1.2e-4
 
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "cosine_decay",
+        "batch_size": batch_size,
+        "epochs": [40, 80, 100],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    },
+    "batch_size": batch_size,
+    "lr": 0.0125,
+    "total_images": 6149,
+    "num_epochs": 200
+}
+
 
 def optimizer_setting(params):
     ls = params["learning_strategy"]
@@ -300,11 +317,10 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase):
         model = SeResNeXt("se-resnext")
         train_reader = paddle.batch(
             paddle.dataset.flowers.test(use_xmap=False),
-            batch_size=4,
+            batch_size=train_parameters["batch_size"],
             drop_last=True)
-
-        opt = fluid.optimizer.SGD(learning_rate=1e-3)
-        return model, train_reader, opt
+        optimizer = optimizer_setting(train_parameters)
+        return model, train_reader, optimizer
 
     def run_one_loop(self, model, opt, data):
         bs = len(data)
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import contextlib
import unittest
import numpy as np
from collections import OrderedDict

import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.dygraph.parallel import DataParallel
from paddle.fluid.dygraph.base import to_variable


class MyLayer(fluid.Layer):
    def __init__(self, name_scope):
        super(MyLayer, self).__init__(name_scope)

    def forward(self, inputs):
        x = fluid.layers.relu(inputs)
        x = fluid.layers.elementwise_mul(x, x)
        x = fluid.layers.reduce_sum(x)
        return [x]


class TestImperativeParallelCoalesceSplit(unittest.TestCase):
    def test_coalesce_split(self):
        with fluid.dygraph.guard():
            test_layer = MyLayer("test_layer")
            strategy = core.ParallelStrategy()
            test_layer = DataParallel(test_layer, strategy)

            # test variables prepare
            vars = []
            vars.append(to_variable(np.random.random([2, 3]).astype("float32")))
            vars.append(to_variable(np.random.random([4, 9]).astype("float32")))
            vars.append(
                to_variable(np.random.random([10, 1]).astype("float32")))
            var_groups = OrderedDict()
            var_groups.setdefault(0, vars)

            # record shapes
            orig_var_shapes = []
            for var in vars:
                orig_var_shapes.append(var.shape)

            # execute interface
            coalesced_vars = test_layer._coalesce_tensors(var_groups)
            test_layer._split_tensors(coalesced_vars)

            # compare
            for orig_var_shape, var in zip(orig_var_shapes, vars):
                self.assertEqual(orig_var_shape, var.shape)

    def test_reshape_inplace(self):
        with fluid.dygraph.guard():
            test_layer = MyLayer("test_layer")
            strategy = core.ParallelStrategy()
            test_layer = DataParallel(test_layer, strategy)

            ori_shape = [2, 25]
            new_shape = [5, 10]
            x_data = np.random.random(ori_shape).astype("float32")
            x = to_variable(x_data)
            test_layer._reshape_inplace(x, new_shape)
            self.assertEqual(x.shape, new_shape)


if __name__ == '__main__':
    unittest.main()