diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 07bab994d354df834d0667c69f307b2d7684fb22..9059ae206207d1feef9e037a635c7a07500f0b25 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -265,6 +265,7 @@ function(cc_test TARGET_NAME)
     if (${cc_test_SERIAL})
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(cc_test)
@@ -330,6 +331,7 @@ function(nv_test TARGET_NAME)
     if (nv_test_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
         set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
+        set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     endif()
   endif()
 endfunction(nv_test)
@@ -577,7 +579,8 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true
+             PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 12e7170fc3da83071f4a23b6c39463d8c2543391..94933d1489c356eb6e9efd6d98bd1cba5ddfcd23 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -949,6 +949,10 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None):
     helper = LayerHelper('dropout', **locals())
     out = helper.create_tmp_variable(dtype=x.dtype)
     mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+
+    if (seed is None or seed == 0) and helper.main_program.random_seed != 0:
+        seed = helper.main_program.random_seed
+
     helper.append_op(
         type='dropout',
         inputs={'X': [x]},
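Note: the CMake hunks above export FLAGS_cudnn_deterministic=true into the environment of every serial test so cuDNN selects reproducible algorithms, and the nn.py hunk makes an unseeded dropout fall back to the program-level random seed. A minimal sketch of the resulting dropout behaviour; the program and variable names (main, startup, x) are illustrative, not from the patch:

    import paddle.fluid as fluid

    main = fluid.Program()
    startup = fluid.Program()
    main.random_seed = 90  # program-level seed

    with fluid.program_guard(main, startup):
        x = fluid.layers.data(name='x', shape=[32], dtype='float32')
        # No explicit seed argument: with the change above, the dropout op
        # is created with seed=90 instead of a nondeterministic default.
        y = fluid.layers.dropout(x, dropout_prob=0.5)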
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
index c7a039d2589ef67bd1d3771a2f11084698ba909f..3a314f49ebe5091aa35299ea32ec593026a57c75 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -98,16 +98,13 @@ class TestMNIST(TestParallelExecutorBase):
         fluid.recordio_writer.convert_reader_to_recordio_file(
             MNIST_RECORDIO_FILE, reader, feeder)

-    def _init_data(self, random=True):
+    def _init_data(self):
         np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
+        img = np.random.random(size=[32, 784]).astype(np.float32)
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label

-    def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True):
+    def _compare_reduce_and_allreduce(self, model, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
         self.check_network_convergence(
@@ -115,7 +112,7 @@
         self.check_network_convergence(
             model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True)

-        img, label = self._init_data(random_data)
+        img, label = self._init_data()

         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             model,
@@ -166,27 +163,27 @@ class TestMNIST(TestParallelExecutorBase):
         if use_cuda and not core.is_compiled_with_cuda():
             return

-        img, label = self._init_data(random=False)
+        img, label = self._init_data()

         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
-            seed=1000,
+            seed=1,
             feed_dict={"image": img,
                        "label": label},
             use_cuda=use_cuda,
             use_parallel_executor=True)

-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)

     def test_simple_fc_parallel_accuracy(self):
         self.check_simple_fc_parallel_accuracy(True)
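Note: the accuracy check above now compares the mean of the per-device losses from ParallelExecutor against the single-Executor loss, instead of comparing every device's loss individually. ParallelExecutor returns one first/last loss per device, each computed on that device's slice of the batch, so with a shared seed and fixed input data their mean should agree with the Executor result. A toy illustration of the assertion; the loss values are made up:

    import numpy as np

    single_first_loss = 2.3025851                  # scalar loss from Executor
    parallel_first_loss = [2.3025849, 2.3025852]   # one loss per device

    assert abs(np.mean(parallel_first_loss) - single_first_loss) < 1e-6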
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index 834e920845f29b153909a971eb5afc4f8a33346e..b56129a433a9b222f93525ed8fd3013c6f653148 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -21,6 +21,19 @@
 from parallel_executor_test_base import TestParallelExecutorBase
 import unittest
 import math
 import os
+import numpy as np
+
+# FIXME(zcd): If the neural net has a dropout_op, the outputs of ParallelExecutor
+# and Executor differ, because ParallelExecutor copies the dropout_op N times
+# (N is the number of devices), so the random numbers it generates differ from
+# the ones Executor generates. Therefore, when comparing the losses of
+# ParallelExecutor and Executor, we should remove the
+# dropout_op.
+remove_dropout = False
+
+# FIXME(zcd): If the neural net has batch_norm, the outputs of ParallelExecutor
+# and Executor are different.
+remove_bn = False

 def squeeze_excitation(input, num_channels, reduction_ratio):
@@ -53,7 +66,8 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
         groups=groups,
         act=None,
         bias_attr=False)
-    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
+    return conv if remove_bn else fluid.layers.batch_norm(
+        input=conv, act=act, momentum=0.1)


 def shortcut(input, ch_out, stride):
@@ -92,13 +106,14 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
     return fluid.layers.elementwise_add(x=short, y=scale, act='relu')


-def SE_ResNeXt50Small(batch_size=2, use_feed=False):
-    assert not use_feed, "SE_ResNeXt doesn't support feed yet"
+batch_size = 12
+img_shape = [3, 224, 224]
+

-    img = fluid.layers.fill_constant(
-        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
-    label = fluid.layers.fill_constant(
-        shape=[batch_size, 1], dtype='int64', value=0.0)
+def SE_ResNeXt50Small(use_feed):
+
+    img = fluid.layers.data(name='image', shape=img_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

     conv = conv_bn_layer(
         input=img, num_filters=16, filter_size=3, stride=2, act='relu')
@@ -127,7 +142,8 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
     reshape = fluid.layers.reshape(
         x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
     pool = fluid.layers.reduce_mean(input=reshape, dim=2)
-    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    dropout = pool if remove_dropout else fluid.layers.dropout(
+        x=pool, dropout_prob=0.2, seed=1)
     # Classifier layer:
     prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
     loss = fluid.layers.cross_entropy(input=prediction, label=label)
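Note: the hunk below promotes the nested _cosine_decay/_optimizer closures to module-level cosine_decay and optimizer helpers so they can be passed to check_network_convergence directly. The schedule scales the base rate by a half cosine wave over epochs; a plain-Python paraphrase of the values it produces under the test settings (cosine_decay_value is a hypothetical helper for illustration, not from the patch):

    import math

    def cosine_decay_value(learning_rate, step, step_each_epoch, epochs):
        # Same formula as the fluid version in the hunk below,
        # evaluated eagerly instead of via fluid ops.
        epoch = step // step_each_epoch
        return learning_rate * (math.cos(epoch * math.pi / epochs) + 1) / 2

    # learning_rate=0.01, step_each_epoch=2, epochs=1 (the test settings):
    # steps 0-1 -> epoch 0 -> 0.01 * (cos(0) + 1) / 2 = 0.01
    # steps 2+  -> epoch 1 -> 0.01 * (cos(pi) + 1) / 2 = 0.0
    assert abs(cosine_decay_value(0.01, 0, 2, 1) - 0.01) < 1e-12
    assert abs(cosine_decay_value(0.01, 2, 2, 1) - 0.0) < 1e-12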
@@ -135,75 +151,135 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False):
     return loss


-class TestResnet(TestParallelExecutorBase):
-    def check_resnet_convergence_with_learning_rate_decay(self,
-                                                          use_cuda=True,
-                                                          use_reduce=False,
-                                                          iter=20):
+def cosine_decay(learning_rate, step_each_epoch, epochs=120):
+    """
+    Applies cosine decay to the learning rate:
+    lr = learning_rate * (math.cos(epoch * (math.pi / epochs)) + 1) / 2
+    """
+    global_step = _decay_step_counter()

-        if use_cuda and not core.is_compiled_with_cuda():
-            return
+    with init_on_cpu():
+        epoch = ops.floor(global_step / step_each_epoch)
+        decayed_lr = learning_rate * \
+            (ops.cos(epoch * (math.pi / epochs)) + 1) / 2
+    return decayed_lr

-        os.environ['CPU_NUM'] = str(4)

-        def _cosine_decay(learning_rate, step_each_epoch, epochs=120):
-            """
-            Applies cosine decay to the learning rate.
-            lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1)
-            """
-            global_step = _decay_step_counter()
+def optimizer(learning_rate=0.01):
+    optimizer = fluid.optimizer.Momentum(
+        learning_rate=cosine_decay(
+            learning_rate=learning_rate, step_each_epoch=2, epochs=1),
+        momentum=0.9,
+        regularization=fluid.regularizer.L2Decay(1e-4))
+    return optimizer

-            with init_on_cpu():
-                epoch = ops.floor(global_step / step_each_epoch)
-                decayed_lr = learning_rate * \
-                    (ops.cos(epoch * (math.pi / epochs)) + 1)/2
-            return decayed_lr

-        def _optimizer(learning_rate=0.01):
-            optimizer = fluid.optimizer.Momentum(
-                learning_rate=_cosine_decay(
-                    learning_rate=learning_rate, step_each_epoch=2, epochs=1),
-                momentum=0.9,
-                regularization=fluid.regularizer.L2Decay(1e-4))
-            return optimizer
+class TestResnet(TestParallelExecutorBase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+        global remove_dropout
+        global remove_bn
+        remove_dropout = False
+        remove_bn = False
+
+    def _init_data(self, batch_size=2, random=True):
+        np.random.seed(5)
+        if random:
+            img = np.random.random(
+                size=[batch_size] + img_shape).astype(np.float32)
+        else:
+            img = np.ones(shape=[batch_size] + img_shape, dtype='float32')
+        label = [np.random.randint(0, 999) for _ in range(batch_size)]
+        label = np.array(label).astype(np.int64).reshape(-1, 1)
+        return img, label
+
+    def _compare_reduce_and_allreduce(self,
+                                      model,
+                                      use_cuda,
+                                      iter=20,
+                                      delta2=1e-4):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return

-        import functools
+        global remove_bn
+        remove_bn = True

-        batch_size = 2
+        img, label = self._init_data(batch_size=batch_size)
+        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=False,
+            optimizer=optimizer)
+        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
+            model,
+            feed_dict={"image": img,
+                       "label": label},
+            iter=iter,
+            batch_size=batch_size,
+            use_cuda=use_cuda,
+            use_reduce=True,
+            optimizer=optimizer)
+
+        for loss in zip(all_reduce_first_loss, reduce_first_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)
+        for loss in zip(all_reduce_last_loss, reduce_last_loss):
+            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+
+    def _check_resnet_convergence(self,
+                                  model,
+                                  use_cuda=True,
+                                  use_reduce=False,
+                                  iter=20,
+                                  delta2=1e-6):
+        if use_cuda and not core.is_compiled_with_cuda():
+            return
+
+        global remove_dropout
+        global remove_bn
+        remove_dropout = True
+        remove_bn = True
+
+        img, label = self._init_data(batch_size=batch_size)
         single_first_loss, single_last_loss = self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
+            model,
+            feed_dict={"image": img,
+                       "label": label},
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
             use_reduce=use_reduce,
-            optimizer=_optimizer,
+            optimizer=optimizer,
             use_parallel_executor=False)
-
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            functools.partial(
-                SE_ResNeXt50Small, batch_size=batch_size),
+            model,
+            feed_dict={"image": img,
+                       "label": label},
             iter=iter,
             batch_size=batch_size,
             use_cuda=use_cuda,
             use_reduce=use_reduce,
-            optimizer=_optimizer)
+            optimizer=optimizer)

-        for p_f in parallel_first_loss:
-            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
-        for p_l in parallel_last_loss:
-            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6)
+        self.assertAlmostEquals(
+            np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)

     def test_seresnext_with_learning_rate_decay(self):
-        self.check_resnet_convergence_with_learning_rate_decay(True, False)
-        self.check_resnet_convergence_with_learning_rate_decay(
-            False, False, iter=5)
-
-    def test_seresnext_with_new_strategy_with_learning_rate_decay(self):
-        self.check_resnet_convergence_with_learning_rate_decay(True, True)
-        self.check_resnet_convergence_with_learning_rate_decay(
-            False, True, iter=5)
+        self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
+        self._check_resnet_convergence(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
+
+    def test_seresnext_with_new_strategy(self):
+        # self._compare_reduce_and_allreduce(
+        #     model=SE_ResNeXt50Small, use_cuda=True)
+        self._compare_reduce_and_allreduce(
+            model=SE_ResNeXt50Small, use_cuda=False, iter=5, delta2=1e-2)


 if __name__ == '__main__':
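Note: with the CMake changes at the top, ctest exports FLAGS_init_allocated_mem and FLAGS_cudnn_deterministic into the test environment automatically. When running one of these test files by hand, the same flags can be set in the environment before paddle.fluid is imported, which is the same environment-variable mechanism the py_test COMMAND above relies on. A sketch, not part of the patch; the CPU_NUM value mirrors setUpClass above:

    import os

    # Must be set before paddle.fluid is imported, since the flags are
    # read from the environment at import time.
    os.environ['FLAGS_init_allocated_mem'] = 'true'
    os.environ['FLAGS_cudnn_deterministic'] = 'true'
    os.environ['CPU_NUM'] = '4'

    import unittest
    # Imports the module (triggering the paddle import) and runs TestResnet.
    unittest.main(module='test_parallel_executor_seresnext')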