diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc
index ca354a63c67bb8f63bad6af7445258858da9b806..ea97aa5fb22a47782a56d73947330fd1a5c260f1 100644
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -31,7 +31,13 @@ std::string OpHandleBase::DebugString() const {
   return ss.str();
 }
 
-OpHandleBase::~OpHandleBase() {}
+OpHandleBase::~OpHandleBase() {
+#ifdef PADDLE_WITH_CUDA
+  for (auto &ev : events_) {
+    cudaEventDestroy(ev.second);
+  }
+#endif
+}
 
 void OpHandleBase::Run(bool use_event) {
 #ifdef PADDLE_WITH_CUDA
diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc
index e0209fce76b1459fd39c146da4a5e610554d3804..a853da6fba7aa0812d6c8f9f44540329578eff6b 100644
--- a/paddle/fluid/framework/details/ssa_graph_builder.cc
+++ b/paddle/fluid/framework/details/ssa_graph_builder.cc
@@ -21,7 +21,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) {
   for (auto &var_map : graph->vars_) {
     for (auto &name_pair : var_map) {
       if (name_pair.second.size() <= 1) {
-        return;
+        continue;
       }
       auto it_new = name_pair.second.rbegin();
       auto it_old = name_pair.second.rbegin();
diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore
index ad02bdecf436bba925e2e3b7efb20c878df70dfd..51b1da4c84ad1b98b3ee7ceb372394e6f3724086 100644
--- a/python/paddle/fluid/tests/unittests/.gitignore
+++ b/python/paddle/fluid/tests/unittests/.gitignore
@@ -2,3 +2,4 @@ mnist.recordio
 mnist_0.recordio
 mnist_1.recordio
 mnist_2.recordio
+flowers.recordio
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index dd6e70eadbd83603ac093810d74fd1853195891f..d5d2275e4d90524bdf054d8d79395be0c592bb88 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -16,6 +16,7 @@ import unittest
 import paddle.fluid as fluid
 import paddle.v2 as paddle
 import paddle.v2.dataset.mnist as mnist
+import paddle.v2.dataset.flowers as flowers
 import numpy
 
 
@@ -64,6 +65,119 @@ def fc_with_batchnorm():
     return loss
 
 
+def squeeze_excitation(input, num_channels, reduction_ratio):
+    # pool = fluid.layers.pool2d(
+    #     input=input, pool_size=0, pool_type='avg', global_pooling=True)
+    conv = input
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+
+    squeeze = fluid.layers.fc(input=pool,
+                              size=num_channels / reduction_ratio,
+                              act='relu')
+    excitation = fluid.layers.fc(input=squeeze,
+                                 size=num_channels,
+                                 act='sigmoid')
+    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
+    return scale
+
+
+def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
+                  act=None):
+    conv = fluid.layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        stride=stride,
+        padding=(filter_size - 1) / 2,
+        groups=groups,
+        act=None,
+        bias_attr=False)
+    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)
+
+
+def shortcut(input, ch_out, stride):
+    ch_in = input.shape[1]
+    if ch_in != ch_out:
+        if stride == 1:
+            filter_size = 1
+        else:
+            filter_size = 3
+        return conv_bn_layer(input, ch_out, filter_size, stride)
+    else:
+        return input
+
+
+def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
+    # The number of first 1x1 convolutional channels for each bottleneck building block
+    # was halved to reduce the computation cost.
+    conv0 = conv_bn_layer(
+        input=input, num_filters=num_filters, filter_size=1, act='relu')
+    conv1 = conv_bn_layer(
+        input=conv0,
+        num_filters=num_filters * 2,
+        filter_size=3,
+        stride=stride,
+        groups=cardinality,
+        act='relu')
+    conv2 = conv_bn_layer(
+        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
+    scale = squeeze_excitation(
+        input=conv2,
+        num_channels=num_filters * 2,
+        reduction_ratio=reduction_ratio)
+
+    short = shortcut(input, num_filters * 2, stride)
+
+    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
+
+
+def SE_ResNeXt152():
+    reader = fluid.layers.open_recordio_file(
+        filename='./flowers.recordio',
+        shapes=[[-1, 3, 224, 224], [-1, 1]],
+        lod_levels=[0, 0],
+        dtypes=['float32', 'int64'])
+
+    img, label = fluid.layers.read_file(reader)
+
+    conv = conv_bn_layer(
+        input=img, num_filters=64, filter_size=3, stride=2, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=64, filter_size=3, stride=1, act='relu')
+    conv = conv_bn_layer(
+        input=conv, num_filters=128, filter_size=3, stride=1, act='relu')
+    conv = fluid.layers.pool2d(
+        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
+
+    cardinality = 64
+    reduction_ratio = 16
+    depth = [3, 8, 36, 3]
+    num_filters = [128, 256, 512, 1024]
+
+    for block in range(len(depth)):
+        for i in range(depth[block]):
+            conv = bottleneck_block(
+                input=conv,
+                num_filters=num_filters[block],
+                stride=2 if i == 0 and block != 0 else 1,
+                cardinality=cardinality,
+                reduction_ratio=reduction_ratio)
+
+    shape = conv.shape
+    reshape = fluid.layers.reshape(
+        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
+    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
+    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
+    # Classifier layer:
+    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
 class ParallelExecutor(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
@@ -81,24 +195,40 @@ class ParallelExecutor(unittest.TestCase):
             fluid.recordio_writer.convert_reader_to_recordio_file(
                 './mnist.recordio', reader, feeder)
 
+        with fluid.program_guard(fluid.Program(), fluid.Program()):
+            reader = paddle.batch(flowers.train(), batch_size=4)
+            feeder = fluid.DataFeeder(
+                feed_list=[
+                    fluid.layers.data(
+                        name='image', shape=[3, 224, 224]),
+                    fluid.layers.data(
+                        name='label', shape=[1], dtype='int64'),
+                ],
+                place=fluid.CPUPlace())
+            fluid.recordio_writer.convert_reader_to_recordio_file(
+                "./flowers.recordio", reader, feeder)
+
     def test_simple_fc(self):
         self.check_network_convergence(simple_fc_net)
 
     def test_batchnorm_fc(self):
         self.check_network_convergence(fc_with_batchnorm)
 
-    def check_network_convergence(self, method):
+    def check_network_convergence(self, method, memory_opt=True, iter=10):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
             loss = method()
             adam = fluid.optimizer.Adam()
             adam.minimize(loss)
+            if memory_opt:
+                fluid.memory_optimize(main)
+
             exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
             first_loss, = exe.run([loss.name])
             first_loss = numpy.array(first_loss)
 
-            for i in xrange(10):
+            for i in xrange(iter):
                 exe.run([])
 
             last_loss, = exe.run([loss.name])
@@ -106,3 +236,6 @@ class ParallelExecutor(unittest.TestCase):
 
             print first_loss, last_loss
             self.assertGreater(first_loss[0], last_loss[0])
+
+    def test_resnet(self):
+        self.check_network_convergence(SE_ResNeXt152, iter=20)
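
For quick local verification, a minimal sketch that exercises only the new SE-ResNeXt case: it assumes a CUDA build of Paddle and that the snippet is run from python/paddle/fluid/tests/unittests/, so setUpClass can write ./flowers.recordio next to the test file. Only the file, class, and method names are taken from the diff above; everything else is illustrative.

# run_resnet_case.py -- hypothetical helper script, not part of this patch
import unittest

# ParallelExecutor here is the TestCase class from test_parallel_executor.py
# above, not fluid.ParallelExecutor itself.
from test_parallel_executor import ParallelExecutor

# Build a suite containing only test_resnet; setUpClass still runs first and
# converts the flowers dataset to ./flowers.recordio before the test body.
suite = unittest.TestSuite()
suite.addTest(ParallelExecutor('test_resnet'))
unittest.TextTestRunner(verbosity=2).run(suite)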