diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index aad5677b970452c29354ff8da9b137c448cae8f2..89e8d3e1410f6a7aded7ec797c269a6d53f3fe58 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -40,8 +40,15 @@ class SGDOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                       "Learning rate should have 1 element");
     auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at compile
-    // and runtime.
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
+      PADDLE_ENFORCE_EQ(
+          param_dim, ctx->GetInputDim("Grad"),
+          platform::errors::InvalidArgument(
+              "SGD Operator's input Param and Grad dimensions do not match. "
+              "The Param shape is [%s], but the Grad shape is [%s].",
+              param_dim, ctx->GetInputDim("Grad")));
+    }
     ctx->SetOutputDim("ParamOut", param_dim);
   }
 
diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py
index 6f68cc4e1c00e705f1f74a4254499b81160ad0cd..9ed278bf517390dc901c8d2ee33790aceec3a4be 100644
--- a/python/paddle/fluid/dygraph/parallel.py
+++ b/python/paddle/fluid/dygraph/parallel.py
@@ -184,6 +184,15 @@ class DataParallel(layers.Layer):
                 [coalesced_grad, grad_vars, g_var_shapes])
         return coalesced_grads_and_grad_vars
 
+    def _reshape_inplace(self, x, shape):
+        x_shape = self._helper.create_variable_for_type_inference(dtype=x.dtype)
+        self._helper.append_op(
+            type="reshape2",
+            inputs={'X': x},
+            attrs={'shape': shape},
+            outputs={'Out': x,
+                     'XShape': x_shape})
+
     def _split_tensors(self, coalesced_grads_and_grad_vars):
         from ..layers import nn
         for coalesced_grad, origin_grad_vars, grad_shapes in coalesced_grads_and_grad_vars:
@@ -195,7 +204,8 @@ class DataParallel(layers.Layer):
                 attrs={'sections': grad_var_len,
                        'axis': 0})
             for g_var, g_shape in zip(origin_grad_vars, grad_shapes):
-                nn.reshape(x=g_var, shape=g_shape, inplace=True)
+                self._reshape_inplace(x=g_var, shape=g_shape)
+                assert g_var.shape == g_shape
 
     @no_grad
     def apply_collective_grads(self):
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
index 3890236013c8a29288acde08198dd05abaeb6620..3f96ca98f6a720bcdac3befd506ef02c2f06a894 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
@@ -114,7 +114,7 @@ class TestMnist(TestParallelDyGraphRunnerBase):
         model = MNIST("mnist")
         train_reader = paddle.batch(
             paddle.dataset.mnist.train(), batch_size=2, drop_last=True)
-        opt = fluid.optimizer.SGD(learning_rate=1e-3)
+        opt = fluid.optimizer.Adam(learning_rate=1e-3)
         return model, train_reader, opt
 
     def run_one_loop(self, model, opt, data):
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
index bdf5b483812fb72c47794be72cfcbb57f3dea0c3..5cf8ad8b32ab2e58d7de391a1c0861a1d7838cb7 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
@@ -33,9 +33,26 @@ from paddle.fluid.layer_helper import LayerHelper
 import math
 from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
 
+batch_size = 64
 momentum_rate = 0.9
 l2_decay = 1.2e-4
 
+train_parameters = {
+    "input_size": [3, 224, 224],
+    "input_mean": [0.485, 0.456, 0.406],
+    "input_std": [0.229, 0.224, 0.225],
+    "learning_strategy": {
+        "name": "cosine_decay",
+        "batch_size": batch_size,
+        "epochs": [40, 80, 100],
+        "steps": [0.1, 0.01, 0.001, 0.0001]
+    },
+    "batch_size": batch_size,
+    "lr": 0.0125,
+    "total_images": 6149,
+    "num_epochs": 200
+}
+
 
 def optimizer_setting(params):
     ls = params["learning_strategy"]
@@ -300,11 +317,10 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase):
         model = SeResNeXt("se-resnext")
         train_reader = paddle.batch(
             paddle.dataset.flowers.test(use_xmap=False),
-            batch_size=4,
+            batch_size=train_parameters["batch_size"],
             drop_last=True)
-
-        opt = fluid.optimizer.SGD(learning_rate=1e-3)
-        return model, train_reader, opt
+        optimizer = optimizer_setting(train_parameters)
+        return model, train_reader, optimizer
 
     def run_one_loop(self, model, opt, data):
         bs = len(data)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_parallel_coalesce_split.py b/python/paddle/fluid/tests/unittests/test_imperative_parallel_coalesce_split.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5c32d0003835712bd226812bfae3dbd88577825
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_imperative_parallel_coalesce_split.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import contextlib
+import unittest
+import numpy as np
+from collections import OrderedDict
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.dygraph.parallel import DataParallel
+from paddle.fluid.dygraph.base import to_variable
+
+
+class MyLayer(fluid.Layer):
+    def __init__(self, name_scope):
+        super(MyLayer, self).__init__(name_scope)
+
+    def forward(self, inputs):
+        x = fluid.layers.relu(inputs)
+        x = fluid.layers.elementwise_mul(x, x)
+        x = fluid.layers.reduce_sum(x)
+        return [x]
+
+
+class TestImperativeParallelCoalesceSplit(unittest.TestCase):
+    def test_coalesce_split(self):
+        with fluid.dygraph.guard():
+            test_layer = MyLayer("test_layer")
+            strategy = core.ParallelStrategy()
+            test_layer = DataParallel(test_layer, strategy)
+
+            # test variables prepare
+            vars = []
+            vars.append(to_variable(np.random.random([2, 3]).astype("float32")))
+            vars.append(to_variable(np.random.random([4, 9]).astype("float32")))
+            vars.append(
+                to_variable(np.random.random([10, 1]).astype("float32")))
+            var_groups = OrderedDict()
+            var_groups.setdefault(0, vars)
+
+            # record shapes
+            orig_var_shapes = []
+            for var in vars:
+                orig_var_shapes.append(var.shape)
+
+            # execute interface
+            coalesced_vars = test_layer._coalesce_tensors(var_groups)
+            test_layer._split_tensors(coalesced_vars)
+
+            # compare
+            for orig_var_shape, var in zip(orig_var_shapes, vars):
+                self.assertEqual(orig_var_shape, var.shape)
+
+    def test_reshape_inplace(self):
+        with fluid.dygraph.guard():
+            test_layer = MyLayer("test_layer")
+            strategy = core.ParallelStrategy()
+            test_layer = DataParallel(test_layer, strategy)
+
+            ori_shape = [2, 25]
+            new_shape = [5, 10]
+            x_data = np.random.random(ori_shape).astype("float32")
+            x = to_variable(x_data)
+            test_layer._reshape_inplace(x, new_shape)
+            self.assertEqual(x.shape, new_shape)
+
+
+if __name__ == '__main__':
+    unittest.main()