test_parallel_executor.py 17.0 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Y
Yu Yang 已提交
15
import numpy
Y
Yu Yang 已提交
16
import unittest
Y
Yu Yang 已提交
17

Y
Yu Yang 已提交
18
import paddle.fluid as fluid
19 20 21
import paddle
import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16
Y
Yu Yang 已提交
22 23


X
Xin Pan 已提交
24 25 26 27 28 29 30 31 32 33 34
def simple_fc_net(use_feed):
    if use_feed:
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
        reader = fluid.layers.open_recordio_file(
            filename='./mnist.recordio',
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'])
        img, label = fluid.layers.read_file(reader)
35 36 37 38 39 40 41 42 43 44 45 46 47 48
    hidden = img
    for _ in xrange(4):
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            act='tanh',
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))
    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


X
Xin Pan 已提交
49 50 51 52 53 54 55 56 57 58 59 60
def fc_with_batchnorm(use_feed):
    if use_feed:
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
        reader = fluid.layers.open_recordio_file(
            filename='./mnist.recordio',
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'])
        img, label = fluid.layers.read_file(reader)

61
    hidden = img
Y
Yu Yang 已提交
62
    for _ in xrange(1):
63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            act='tanh',
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))

        hidden = fluid.layers.batch_norm(input=hidden)

    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


Y
Yu Yang 已提交
78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
def squeeze_excitation(input, num_channels, reduction_ratio):
    # pool = fluid.layers.pool2d(
    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
    conv = input
    shape = conv.shape
    reshape = fluid.layers.reshape(
        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
    pool = fluid.layers.reduce_mean(input=reshape, dim=2)

    squeeze = fluid.layers.fc(input=pool,
                              size=num_channels / reduction_ratio,
                              act='relu')
    excitation = fluid.layers.fc(input=squeeze,
                                 size=num_channels,
                                 act='sigmoid')
    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
    return scale


def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
                  act=None):
    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=(filter_size - 1) / 2,
        groups=groups,
        act=None,
        bias_attr=False)
    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)


def shortcut(input, ch_out, stride):
    ch_in = input.shape[1]
    if ch_in != ch_out:
        if stride == 1:
            filter_size = 1
        else:
            filter_size = 3
        return conv_bn_layer(input, ch_out, filter_size, stride)
    else:
        return input


def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
    # The number of first 1x1 convolutional channels for each bottleneck build block
    # was halved to reduce the compution cost.
    conv0 = conv_bn_layer(
        input=input, num_filters=num_filters, filter_size=1, act='relu')
    conv1 = conv_bn_layer(
        input=conv0,
        num_filters=num_filters * 2,
        filter_size=3,
        stride=stride,
        groups=cardinality,
        act='relu')
    conv2 = conv_bn_layer(
        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
    scale = squeeze_excitation(
        input=conv2,
        num_channels=num_filters * 2,
        reduction_ratio=reduction_ratio)

    short = shortcut(input, num_filters * 2, stride)

    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')


X
Xin Pan 已提交
147 148 149
def SE_ResNeXt152Small(batch_size=2, use_feed=False):
    assert not use_feed, "SE_ResNeXt doesn't support feed yet"

Y
Yu Yang 已提交
150 151 152 153
    img = fluid.layers.fill_constant(
        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
    label = fluid.layers.fill_constant(
        shape=[batch_size, 1], dtype='int64', value=0.0)
Y
Yu Yang 已提交
154 155

    conv = conv_bn_layer(
156
        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
Y
Yu Yang 已提交
157
    conv = conv_bn_layer(
158
        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
Y
Yu Yang 已提交
159
    conv = conv_bn_layer(
160
        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
Y
Yu Yang 已提交
161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189
    conv = fluid.layers.pool2d(
        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')

    cardinality = 64
    reduction_ratio = 16
    depth = [3, 8, 36, 3]
    num_filters = [128, 256, 512, 1024]

    for block in range(len(depth)):
        for i in range(depth[block]):
            conv = bottleneck_block(
                input=conv,
                num_filters=num_filters[block],
                stride=2 if i == 0 and block != 0 else 1,
                cardinality=cardinality,
                reduction_ratio=reduction_ratio)

    shape = conv.shape
    reshape = fluid.layers.reshape(
        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
    # Classifier layer:
    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


Y
Yu Yang 已提交
190 191 192
import time


Y
Yu Yang 已提交
193
class TestParallelExecutorBase(unittest.TestCase):
Y
Yu Yang 已提交
194 195 196 197
    def check_network_convergence(self,
                                  method,
                                  memory_opt=True,
                                  iter=10,
X
Xin Pan 已提交
198
                                  batch_size=None,
X
Xin Pan 已提交
199 200
                                  allow_op_delay=False,
                                  feed_dict={}):
Y
Yu Yang 已提交
201 202 203
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
X
Xin Pan 已提交
204
            loss = method(use_feed=len(feed_dict) > 0)
Y
Yu Yang 已提交
205 206 207 208 209
            adam = fluid.optimizer.Adam()
            adam.minimize(loss)
            if memory_opt:
                fluid.memory_optimize(main)

X
Xin Pan 已提交
210
            exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True)
Y
Yu Yang 已提交
211 212 213
            if batch_size is not None:
                batch_size *= fluid.core.get_cuda_device_count()
            begin = time.time()
X
Xin Pan 已提交
214
            first_loss, = exe.run([loss.name], feed_dict=feed_dict)
Y
Yu Yang 已提交
215 216 217
            first_loss = numpy.array(first_loss)

            for i in xrange(iter):
X
Xin Pan 已提交
218
                exe.run([], feed_dict=feed_dict)
Y
Yu Yang 已提交
219

X
Xin Pan 已提交
220
            last_loss, = exe.run([loss.name], feed_dict=feed_dict)
Y
Yu Yang 已提交
221 222 223 224 225 226
            end = time.time()

            if batch_size is not None:
                print "%.4f Instance per second" % (
                    (batch_size * iter + 2) / (end - begin))

Y
Yu Yang 已提交
227 228 229
            last_loss = numpy.array(last_loss)

            print first_loss, last_loss
Y
Yu Yang 已提交
230
            # self.assertGreater(first_loss[0], last_loss[0])
Y
Yu Yang 已提交
231 232 233


class TestMNIST(TestParallelExecutorBase):
234 235
    @classmethod
    def setUpClass(cls):
Y
Stash  
Yu Yang 已提交
236 237
        # Convert mnist to recordio file
        with fluid.program_guard(fluid.Program(), fluid.Program()):
238
            reader = paddle.batch(mnist.train(), batch_size=4)
Y
Stash  
Yu Yang 已提交
239 240 241 242 243 244 245 246 247 248 249
            feeder = fluid.DataFeeder(
                feed_list=[  # order is image and label
                    fluid.layers.data(
                        name='image', shape=[784]),
                    fluid.layers.data(
                        name='label', shape=[1], dtype='int64'),
                ],
                place=fluid.CPUPlace())
            fluid.recordio_writer.convert_reader_to_recordio_file(
                './mnist.recordio', reader, feeder)

Y
Yu Yang 已提交
250 251
    def test_simple_fc(self):
        self.check_network_convergence(simple_fc_net)
X
Xin Pan 已提交
252
        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
Y
Yu Yang 已提交
253

X
Xin Pan 已提交
254 255 256 257 258 259
        img = numpy.zeros(shape=[32, 784], dtype='float32')
        label = numpy.ones(shape=[32, 1], dtype='int64')
        self.check_network_convergence(
            simple_fc_net, feed_dict={"image": img,
                                      "label": label})

Y
Yu Yang 已提交
260 261
    def test_batchnorm_fc(self):
        self.check_network_convergence(fc_with_batchnorm)
X
Xin Pan 已提交
262 263 264 265 266
        img = numpy.zeros(shape=[32, 784], dtype='float32')
        label = numpy.ones(shape=[32, 1], dtype='int64')
        self.check_network_convergence(
            fc_with_batchnorm, feed_dict={"image": img,
                                          "label": label})
Y
Yu Yang 已提交
267 268 269


class TestResnet(TestParallelExecutorBase):
Y
Yu Yang 已提交
270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286
    # @classmethod
    # def setUpClass(cls):
    #     # import os
    #     # if os.path.exists('./flowers.recordio'):
    #     #     return
    #     with fluid.program_guard(fluid.Program(), fluid.Program()):
    #         reader = paddle.batch(flowers.train(), batch_size=4)
    #         feeder = fluid.DataFeeder(
    #             feed_list=[
    #                 fluid.layers.data(
    #                     name='image', shape=[3, 224, 224]),
    #                 fluid.layers.data(
    #                     name='label', shape=[1], dtype='int64'),
    #             ],
    #             place=fluid.CPUPlace())
    #         fluid.recordio_writer.convert_reader_to_recordio_file(
    #             "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
Y
Yu Yang 已提交
287 288

    def test_resnet(self):
Y
Yu Yang 已提交
289
        import functools
290
        batch_size = 2
Y
Yu Yang 已提交
291 292
        self.check_network_convergence(
            functools.partial(
293
                SE_ResNeXt152Small, batch_size=batch_size),
Y
Yu Yang 已提交
294 295
            iter=20,
            batch_size=batch_size)
Y
Yu Yang 已提交
296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421


class ModelHyperParams(object):
    # Dictionary size for source and target language. This model directly uses
    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
    # alreay been added, but the <pad> token is not added. Transformer requires
    # sequences in a mini-batch are padded to have the same length. A <pad> token is
    # added into the original dictionary in paddle.dateset.wmt16.

    # size of source word dictionary.
    src_vocab_size = 10000
    # index for <pad> token in source language.
    src_pad_idx = src_vocab_size

    # size of target word dictionay
    trg_vocab_size = 10000
    # index for <pad> token in target language.
    trg_pad_idx = trg_vocab_size

    # position value corresponding to the <pad> token.
    pos_pad_idx = 0

    # max length of sequences. It should plus 1 to include position
    # padding token for position encoding.
    max_length = 50

    # the dimension for word embeddings, which is also the last dimension of
    # the input and output of multi-head attention, position-wise feed-forward
    # networks, encoder and decoder.

    d_model = 512
    # size of the hidden layer in position-wise feed-forward networks.
    d_inner_hid = 1024
    # the dimension that keys are projected to for dot-product attention.
    d_key = 64
    # the dimension that values are projected to for dot-product attention.
    d_value = 64
    # number of head used in multi-head attention.
    n_head = 8
    # number of sub-layers to be stacked in the encoder and decoder.
    n_layer = 6
    # dropout rate used by all dropout layers.
    dropout = 0.1


import numpy as np


def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and attention bias. Then, convert the numpy
    data to tensors and return a dict mapping names to tensors.
    """

    def __pad_batch_data(insts,
                         pad_idx,
                         is_target=False,
                         return_pos=True,
                         return_attn_bias=True,
                         return_max_len=True):
        """
        Pad the instances to the max sequence length in batch, and generate the
        corresponding position data and attention bias.
        """
        return_list = []
        max_len = max(len(inst) for inst in insts)
        inst_data = np.array(
            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
        return_list += [inst_data.astype("int64").reshape([-1, 1])]
        if return_pos:
            inst_pos = np.array([[
                pos_i + 1 if w_i != pad_idx else 0
                for pos_i, w_i in enumerate(inst)
            ] for inst in inst_data])

            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
        if return_attn_bias:
            if is_target:
                # This is used to avoid attention on paddings and subsequent
                # words.
                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
                                              max_len))
                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
                    [-1, 1, max_len, max_len])
                slf_attn_bias_data = np.tile(slf_attn_bias_data,
                                             [1, n_head, 1, 1]) * [-1e9]
            else:
                # This is used to avoid attention on paddings.
                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
                                               (max_len - len(inst))
                                               for inst in insts])
                slf_attn_bias_data = np.tile(
                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
                    [1, n_head, max_len, 1])
            return_list += [slf_attn_bias_data.astype("float32")]
        if return_max_len:
            return_list += [max_len]
        return return_list if len(return_list) > 1 else return_list[0]

    def data_to_tensor(data_list, name_list, input_dict, place):
        assert len(data_list) == len(name_list)
        for i in range(len(name_list)):
            tensor = fluid.LoDTensor()
            tensor.set(data_list[i], place)
            input_dict[name_list[i]] = tensor

    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
        [inst[0] for inst in insts], src_pad_idx, is_target=False)
    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
                                [1, 1, trg_max_len, 1]).astype("float32")
    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
                                False, False, False)
    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])

    return [
        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
    ]


import transformer_model


X
Xin Pan 已提交
422 423
def transformer(use_feed):
    assert not use_feed, "transfomer doesn't support feed yet"
Y
Yu Yang 已提交
424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
    return transformer_model.transformer(
        ModelHyperParams.src_vocab_size + 1,
        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer, ModelHyperParams.n_head,
        ModelHyperParams.d_key, ModelHyperParams.d_value,
        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)


class TestTransformer(TestParallelExecutorBase):
    @classmethod
    def setUpClass(cls):
        reader = paddle.batch(
            wmt16.train(ModelHyperParams.src_vocab_size,
                        ModelHyperParams.trg_vocab_size),
            batch_size=transformer_model.batch_size)

        with fluid.recordio_writer.create_recordio_writer(
                "./wmt16.recordio") as writer:
            for batch in reader():
                for tensor in prepare_batch_input(
                        batch, ModelHyperParams.src_pad_idx,
                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
                    t = fluid.LoDTensor()
                    t.set(tensor, fluid.CPUPlace())
                    writer.append_tensor(t)
                writer.complete_append_tensor()

Y
Yu Yang 已提交
453
    @unittest.skip("transformer is buggy in multi gpu")
Y
Yu Yang 已提交
454 455
    def test_main(self):
        self.check_network_convergence(transformer)