test_parallel_executor.py 33.8 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

C
chengduoZH 已提交
15
import numpy as np
Y
Yu Yang 已提交
16
import unittest
Y
Yu Yang 已提交
17

Y
Yu Yang 已提交
18
import paddle.fluid as fluid
19 20 21
import paddle
import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16
Y
Yu Yang 已提交
22

Y
yuyang18 已提交
23 24 25
MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio"

Y
Yu Yang 已提交
26

X
Xin Pan 已提交
27 28 29 30 31
def simple_fc_net(use_feed):
    if use_feed:
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
J
JiayiFeng 已提交
32
        reader = fluid.layers.open_files(
Y
yuyang18 已提交
33
            filenames=[MNIST_RECORDIO_FILE],
X
Xin Pan 已提交
34 35
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
J
JiayiFeng 已提交
36 37 38 39
            dtypes=['float32', 'int64'],
            thread_num=1,
            for_parallel=True)
        reader = fluid.layers.io.double_buffer(reader)
X
Xin Pan 已提交
40
        img, label = fluid.layers.read_file(reader)
41 42 43 44 45 46 47 48 49 50 51 52 53 54
    hidden = img
    for _ in xrange(4):
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            act='tanh',
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))
    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


X
Xin Pan 已提交
55 56 57 58 59
def fc_with_batchnorm(use_feed):
    if use_feed:
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
J
JiayiFeng 已提交
60
        reader = fluid.layers.open_files(
Y
yuyang18 已提交
61
            filenames=[MNIST_RECORDIO_FILE],
X
Xin Pan 已提交
62 63
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
J
JiayiFeng 已提交
64 65 66 67
            dtypes=['float32', 'int64'],
            thread_num=1,
            for_parallel=True)
        reader = fluid.layers.io.double_buffer(reader)
X
Xin Pan 已提交
68 69
        img, label = fluid.layers.read_file(reader)

70
    hidden = img
Y
Yu Yang 已提交
71
    for _ in xrange(1):
72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            act='tanh',
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))

        hidden = fluid.layers.batch_norm(input=hidden)

    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


Y
Yu Yang 已提交
87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155
def squeeze_excitation(input, num_channels, reduction_ratio):
    # pool = fluid.layers.pool2d(
    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
    conv = input
    shape = conv.shape
    reshape = fluid.layers.reshape(
        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
    pool = fluid.layers.reduce_mean(input=reshape, dim=2)

    squeeze = fluid.layers.fc(input=pool,
                              size=num_channels / reduction_ratio,
                              act='relu')
    excitation = fluid.layers.fc(input=squeeze,
                                 size=num_channels,
                                 act='sigmoid')
    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
    return scale


def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
                  act=None):
    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=(filter_size - 1) / 2,
        groups=groups,
        act=None,
        bias_attr=False)
    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)


def shortcut(input, ch_out, stride):
    ch_in = input.shape[1]
    if ch_in != ch_out:
        if stride == 1:
            filter_size = 1
        else:
            filter_size = 3
        return conv_bn_layer(input, ch_out, filter_size, stride)
    else:
        return input


def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
    # The number of first 1x1 convolutional channels for each bottleneck build block
    # was halved to reduce the compution cost.
    conv0 = conv_bn_layer(
        input=input, num_filters=num_filters, filter_size=1, act='relu')
    conv1 = conv_bn_layer(
        input=conv0,
        num_filters=num_filters * 2,
        filter_size=3,
        stride=stride,
        groups=cardinality,
        act='relu')
    conv2 = conv_bn_layer(
        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
    scale = squeeze_excitation(
        input=conv2,
        num_channels=num_filters * 2,
        reduction_ratio=reduction_ratio)

    short = shortcut(input, num_filters * 2, stride)

    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')


X
Xin Pan 已提交
156
def SE_ResNeXt50Small(batch_size=2, use_feed=False):
X
Xin Pan 已提交
157 158
    assert not use_feed, "SE_ResNeXt doesn't support feed yet"

Y
Yu Yang 已提交
159 160 161 162
    img = fluid.layers.fill_constant(
        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
    label = fluid.layers.fill_constant(
        shape=[batch_size, 1], dtype='int64', value=0.0)
Y
Yu Yang 已提交
163 164

    conv = conv_bn_layer(
165
        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
Y
Yu Yang 已提交
166
    conv = conv_bn_layer(
167
        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
Y
Yu Yang 已提交
168
    conv = conv_bn_layer(
169
        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
Y
Yu Yang 已提交
170 171 172
    conv = fluid.layers.pool2d(
        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')

X
Xin Pan 已提交
173
    cardinality = 32
Y
Yu Yang 已提交
174
    reduction_ratio = 16
X
Xin Pan 已提交
175
    depth = [3, 4, 6, 3]
Y
Yu Yang 已提交
176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198
    num_filters = [128, 256, 512, 1024]

    for block in range(len(depth)):
        for i in range(depth[block]):
            conv = bottleneck_block(
                input=conv,
                num_filters=num_filters[block],
                stride=2 if i == 0 and block != 0 else 1,
                cardinality=cardinality,
                reduction_ratio=reduction_ratio)

    shape = conv.shape
    reshape = fluid.layers.reshape(
        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
    # Classifier layer:
    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


Y
Yu Yang 已提交
199 200 201
import time


Y
Yu Yang 已提交
202
class TestParallelExecutorBase(unittest.TestCase):
Y
Yu Yang 已提交
203 204 205
    def check_network_convergence(self,
                                  method,
                                  memory_opt=True,
J
JiayiFeng 已提交
206
                                  iter=50,
X
Xin Pan 已提交
207
                                  batch_size=None,
X
Xin Pan 已提交
208
                                  allow_op_delay=False,
209
                                  feed_dict=None,
J
stash  
JiayiFeng 已提交
210
                                  seed=None,
C
chengduoZH 已提交
211 212
                                  use_parallel_executor=True,
                                  balance_parameter_opt_between_cards=False):
J
JiayiFeng 已提交
213 214 215 216 217 218 219 220 221 222 223
        def run_executor(exe, feed, fetch_list, program=None):
            if isinstance(exe, fluid.ParallelExecutor):
                res = exe.run(fetch_list=fetch_list, feed=feed)
            elif isinstance(exe, fluid.Executor):
                if program is None:
                    program = fluid.default_main_program()
                res = exe.run(program=program, feed=feed, fetch_list=fetch_list)
            else:
                raise ValueError('Unkown type exe')
            return res

Y
Yu Yang 已提交
224 225
        main = fluid.Program()
        startup = fluid.Program()
Y
Yu Yang 已提交
226
        startup.random_seed = 1  # Fix random seed
Y
Yu Yang 已提交
227
        with fluid.program_guard(main, startup):
F
fengjiayi 已提交
228
            if seed is not None:
J
stash  
JiayiFeng 已提交
229
                startup.random_seed = seed
Y
Yu Yang 已提交
230
            loss = method(use_feed=feed_dict is not None)
Y
Yu Yang 已提交
231 232 233 234
            adam = fluid.optimizer.Adam()
            adam.minimize(loss)
            if memory_opt:
                fluid.memory_optimize(main)
235 236 237
            place = fluid.CUDAPlace(0)
            startup_exe = fluid.Executor(place)
            startup_exe.run(startup)
Y
yuyang18 已提交
238 239
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.allow_op_delay = allow_op_delay
Y
yuyang18 已提交
240 241 242 243

            build_strategy = fluid.BuildStrategy()
            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce

F
fengjiayi 已提交
244
            if use_parallel_executor:
245
                exe = fluid.ParallelExecutor(
C
chengduoZH 已提交
246 247
                    True,
                    loss_name=loss.name,
Y
yuyang18 已提交
248 249
                    exec_strategy=exec_strategy,
                    build_strategy=build_strategy)
F
fengjiayi 已提交
250 251 252
            else:
                exe = fluid.Executor(place=place)

Y
Yu Yang 已提交
253 254 255
            if batch_size is not None:
                batch_size *= fluid.core.get_cuda_device_count()
            begin = time.time()
J
JiayiFeng 已提交
256 257
            first_loss, = run_executor(
                exe=exe, feed=feed_dict, fetch_list=[loss.name])
C
chengduoZH 已提交
258
            first_loss = np.array(first_loss)
Y
Yu Yang 已提交
259 260

            for i in xrange(iter):
J
JiayiFeng 已提交
261
                run_executor(exe=exe, feed=feed_dict, fetch_list=[])
Y
Yu Yang 已提交
262

J
JiayiFeng 已提交
263 264
            last_loss, = run_executor(
                exe=exe, feed=feed_dict, fetch_list=[loss.name])
Y
Yu Yang 已提交
265 266 267 268 269 270
            end = time.time()

            if batch_size is not None:
                print "%.4f Instance per second" % (
                    (batch_size * iter + 2) / (end - begin))

C
chengduoZH 已提交
271
            last_loss = np.array(last_loss)
Y
Yu Yang 已提交
272 273

            print first_loss, last_loss
Y
Yu Yang 已提交
274
            # self.assertGreater(first_loss[0], last_loss[0])
F
fengjiayi 已提交
275
            return first_loss, last_loss
Y
Yu Yang 已提交
276 277 278


class TestMNIST(TestParallelExecutorBase):
279 280
    @classmethod
    def setUpClass(cls):
Y
Stash  
Yu Yang 已提交
281 282
        # Convert mnist to recordio file
        with fluid.program_guard(fluid.Program(), fluid.Program()):
283
            reader = paddle.batch(mnist.train(), batch_size=4)
Y
Stash  
Yu Yang 已提交
284 285 286 287 288 289 290 291 292
            feeder = fluid.DataFeeder(
                feed_list=[  # order is image and label
                    fluid.layers.data(
                        name='image', shape=[784]),
                    fluid.layers.data(
                        name='label', shape=[1], dtype='int64'),
                ],
                place=fluid.CPUPlace())
            fluid.recordio_writer.convert_reader_to_recordio_file(
Y
yuyang18 已提交
293
                MNIST_RECORDIO_FILE, reader, feeder)
Y
Stash  
Yu Yang 已提交
294

C
chengduoZH 已提交
295
    def check_simple_fc_convergence(self, balance_parameter_opt_between_cards):
Y
Yu Yang 已提交
296
        self.check_network_convergence(simple_fc_net)
X
Xin Pan 已提交
297
        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
Y
Yu Yang 已提交
298

C
chengduoZH 已提交
299 300
        img = np.zeros(shape=[32, 784], dtype='float32')
        label = np.ones(shape=[32, 1], dtype='int64')
X
Xin Pan 已提交
301
        self.check_network_convergence(
C
chengduoZH 已提交
302 303 304 305 306
            simple_fc_net,
            feed_dict={"image": img,
                       "label": label},
            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
        )
C
chengduoZH 已提交
307

308
    def test_simple_fc(self):
C
chengduoZH 已提交
309
        self.check_simple_fc_convergence(False)
X
Xin Pan 已提交
310

C
chengduoZH 已提交
311 312 313 314 315
    def test_simple_fc_with_new_strategy(self):
        self.check_simple_fc_convergence(True)

    def check_simple_fc_parallel_accuracy(self,
                                          balance_parameter_opt_between_cards):
C
chengduoZH 已提交
316 317
        img = np.zeros(shape=[32, 784], dtype='float32')
        label = np.ones(shape=[32, 1], dtype='int64')
J
JiayiFeng 已提交
318 319 320 321 322 323 324 325 326 327 328
        single_first_loss, single_last_loss = self.check_network_convergence(
            method=simple_fc_net,
            seed=1000,
            feed_dict={"image": img,
                       "label": label},
            use_parallel_executor=False)
        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
            method=simple_fc_net,
            seed=1000,
            feed_dict={"image": img,
                       "label": label},
C
chengduoZH 已提交
329 330 331
            use_parallel_executor=True,
            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
        )
J
JiayiFeng 已提交
332 333 334 335 336

        for p_f in parallel_first_loss:
            self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6)
        for p_l in parallel_last_loss:
            self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6)
F
fengjiayi 已提交
337

338
    def test_simple_fc_parallel_accuracy(self):
C
chengduoZH 已提交
339 340 341 342
        self.check_simple_fc_parallel_accuracy(False)

    def test_simple_fc_parallel_accuracy_with_new_strategy(self):
        self.check_simple_fc_parallel_accuracy(True)
C
chengduoZH 已提交
343

C
chengduoZH 已提交
344 345
    def check_batchnorm_fc_convergence(self,
                                       balance_parameter_opt_between_cards):
Y
Yu Yang 已提交
346
        self.check_network_convergence(fc_with_batchnorm)
C
chengduoZH 已提交
347 348
        img = np.zeros(shape=[32, 784], dtype='float32')
        label = np.ones(shape=[32, 1], dtype='int64')
X
Xin Pan 已提交
349
        self.check_network_convergence(
C
chengduoZH 已提交
350 351 352 353 354
            fc_with_batchnorm,
            feed_dict={"image": img,
                       "label": label},
            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
        )
C
chengduoZH 已提交
355

356
    def test_batchnorm_fc(self):
C
chengduoZH 已提交
357 358 359 360
        self.check_batchnorm_fc_convergence(False)

    def test_batchnorm_fc_with_new_strategy(self):
        self.check_batchnorm_fc_convergence(True)
Y
Yu Yang 已提交
361 362 363


class TestResnet(TestParallelExecutorBase):
Y
Yu Yang 已提交
364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380
    # @classmethod
    # def setUpClass(cls):
    #     # import os
    #     # if os.path.exists('./flowers.recordio'):
    #     #     return
    #     with fluid.program_guard(fluid.Program(), fluid.Program()):
    #         reader = paddle.batch(flowers.train(), batch_size=4)
    #         feeder = fluid.DataFeeder(
    #             feed_list=[
    #                 fluid.layers.data(
    #                     name='image', shape=[3, 224, 224]),
    #                 fluid.layers.data(
    #                     name='label', shape=[1], dtype='int64'),
    #             ],
    #             place=fluid.CPUPlace())
    #         fluid.recordio_writer.convert_reader_to_recordio_file(
    #             "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
Y
Yu Yang 已提交
381

C
chengduoZH 已提交
382
    def check_resnet_convergence(self, balance_parameter_opt_between_cards):
Y
Yu Yang 已提交
383
        import functools
384
        batch_size = 2
Y
Yu Yang 已提交
385 386
        self.check_network_convergence(
            functools.partial(
X
Xin Pan 已提交
387
                SE_ResNeXt50Small, batch_size=batch_size),
Y
Yu Yang 已提交
388
            iter=20,
C
chengduoZH 已提交
389 390 391
            batch_size=batch_size,
            balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
        )
C
chengduoZH 已提交
392

393
    def test_resnet(self):
C
chengduoZH 已提交
394 395 396 397
        self.check_resnet_convergence(False)

    def test_resnet_with_new_strategy(self):
        self.check_resnet_convergence(True)
Y
Yu Yang 已提交
398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520


class ModelHyperParams(object):
    # Dictionary size for source and target language. This model directly uses
    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
    # alreay been added, but the <pad> token is not added. Transformer requires
    # sequences in a mini-batch are padded to have the same length. A <pad> token is
    # added into the original dictionary in paddle.dateset.wmt16.

    # size of source word dictionary.
    src_vocab_size = 10000
    # index for <pad> token in source language.
    src_pad_idx = src_vocab_size

    # size of target word dictionay
    trg_vocab_size = 10000
    # index for <pad> token in target language.
    trg_pad_idx = trg_vocab_size

    # position value corresponding to the <pad> token.
    pos_pad_idx = 0

    # max length of sequences. It should plus 1 to include position
    # padding token for position encoding.
    max_length = 50

    # the dimension for word embeddings, which is also the last dimension of
    # the input and output of multi-head attention, position-wise feed-forward
    # networks, encoder and decoder.

    d_model = 512
    # size of the hidden layer in position-wise feed-forward networks.
    d_inner_hid = 1024
    # the dimension that keys are projected to for dot-product attention.
    d_key = 64
    # the dimension that values are projected to for dot-product attention.
    d_value = 64
    # number of head used in multi-head attention.
    n_head = 8
    # number of sub-layers to be stacked in the encoder and decoder.
    n_layer = 6
    # dropout rate used by all dropout layers.
    dropout = 0.1


def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and attention bias. Then, convert the numpy
    data to tensors and return a dict mapping names to tensors.
    """

    def __pad_batch_data(insts,
                         pad_idx,
                         is_target=False,
                         return_pos=True,
                         return_attn_bias=True,
                         return_max_len=True):
        """
        Pad the instances to the max sequence length in batch, and generate the
        corresponding position data and attention bias.
        """
        return_list = []
        max_len = max(len(inst) for inst in insts)
        inst_data = np.array(
            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
        return_list += [inst_data.astype("int64").reshape([-1, 1])]
        if return_pos:
            inst_pos = np.array([[
                pos_i + 1 if w_i != pad_idx else 0
                for pos_i, w_i in enumerate(inst)
            ] for inst in inst_data])

            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
        if return_attn_bias:
            if is_target:
                # This is used to avoid attention on paddings and subsequent
                # words.
                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
                                              max_len))
                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
                    [-1, 1, max_len, max_len])
                slf_attn_bias_data = np.tile(slf_attn_bias_data,
                                             [1, n_head, 1, 1]) * [-1e9]
            else:
                # This is used to avoid attention on paddings.
                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
                                               (max_len - len(inst))
                                               for inst in insts])
                slf_attn_bias_data = np.tile(
                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
                    [1, n_head, max_len, 1])
            return_list += [slf_attn_bias_data.astype("float32")]
        if return_max_len:
            return_list += [max_len]
        return return_list if len(return_list) > 1 else return_list[0]

    def data_to_tensor(data_list, name_list, input_dict, place):
        assert len(data_list) == len(name_list)
        for i in range(len(name_list)):
            tensor = fluid.LoDTensor()
            tensor.set(data_list[i], place)
            input_dict[name_list[i]] = tensor

    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
        [inst[0] for inst in insts], src_pad_idx, is_target=False)
    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
                                [1, 1, trg_max_len, 1]).astype("float32")
    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
                                False, False, False)
    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])

    return [
        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
    ]


import transformer_model


X
Xin Pan 已提交
521 522
def transformer(use_feed):
    assert not use_feed, "transfomer doesn't support feed yet"
Y
Yu Yang 已提交
523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541
    return transformer_model.transformer(
        ModelHyperParams.src_vocab_size + 1,
        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer, ModelHyperParams.n_head,
        ModelHyperParams.d_key, ModelHyperParams.d_value,
        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)


class TestTransformer(TestParallelExecutorBase):
    @classmethod
    def setUpClass(cls):
        reader = paddle.batch(
            wmt16.train(ModelHyperParams.src_vocab_size,
                        ModelHyperParams.trg_vocab_size),
            batch_size=transformer_model.batch_size)

        with fluid.recordio_writer.create_recordio_writer(
Y
yuyang18 已提交
542
                WMT16_RECORDIO_FILE) as writer:
Y
Yu Yang 已提交
543 544 545 546 547 548 549 550 551
            for batch in reader():
                for tensor in prepare_batch_input(
                        batch, ModelHyperParams.src_pad_idx,
                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
                    t = fluid.LoDTensor()
                    t.set(tensor, fluid.CPUPlace())
                    writer.append_tensor(t)
                writer.complete_append_tensor()

Y
Yu Yang 已提交
552
    @unittest.skip("transformer is buggy in multi gpu")
Y
Yu Yang 已提交
553 554
    def test_main(self):
        self.check_network_convergence(transformer)
555 556 557


class ParallelExecutorTestingDuringTraining(unittest.TestCase):
Y
yuyang18 已提交
558
    def check_network_convergence(self, build_strategy=None):
559 560 561 562 563 564
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            loss = simple_fc_net(True)
            test_program = main.clone(for_test=True)

D
Dang Qingqing 已提交
565
            opt = fluid.optimizer.SGD(learning_rate=0.001)
566 567 568
            opt.minimize(loss)

            batch_size = 32
C
chengduoZH 已提交
569 570
            image = np.random.normal(size=(batch_size, 784)).astype('float32')
            label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")
571 572 573 574 575 576 577

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(startup)
            feed_dict = {'image': image, 'label': label}

            train_exe = fluid.ParallelExecutor(
C
chengduoZH 已提交
578 579 580
                use_cuda=True,
                loss_name=loss.name,
                main_program=main,
Y
yuyang18 已提交
581
                build_strategy=build_strategy)
582 583 584 585

            test_exe = fluid.ParallelExecutor(
                use_cuda=True,
                main_program=test_program,
C
chengduoZH 已提交
586
                share_vars_from=train_exe,
Y
yuyang18 已提交
587
                build_strategy=build_strategy)
588 589

            for i in xrange(5):
J
stash  
JiayiFeng 已提交
590
                test_loss, = test_exe.run([loss.name], feed=feed_dict)
C
chengduoZH 已提交
591
                test_loss = np.array(test_loss)
592

J
stash  
JiayiFeng 已提交
593
                train_loss, = train_exe.run([loss.name], feed=feed_dict)
C
chengduoZH 已提交
594
                train_loss = np.array(train_loss)
D
Dang Qingqing 已提交
595
                self.assertTrue(
C
chengduoZH 已提交
596
                    np.allclose(
D
Dang Qingqing 已提交
597 598 599
                        train_loss, test_loss, atol=1e-8),
                    "Train loss: " + str(train_loss) + "\n Test loss:" +
                    str(test_loss))
Y
Yu Yang 已提交
600

C
chengduoZH 已提交
601
    def test_parallel_testing(self):
Y
yuyang18 已提交
602 603 604
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
        self.check_network_convergence(build_strategy)
C
chengduoZH 已提交
605 606

    def test_parallel_testing_with_new_strategy(self):
Y
yuyang18 已提交
607 608 609
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        self.check_network_convergence(build_strategy)
C
chengduoZH 已提交
610

Y
Yu Yang 已提交
611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628

import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid

word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
embedding_name = 'emb'


def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
Y
yuyang18 已提交
629
            is_sparse, **ignored):
Y
Yu Yang 已提交
630 631 632
    # 8 features
    predicate_embedding = fluid.layers.embedding(
        input=predicate,
C
chengduoZH 已提交
633
        is_sparse=is_sparse,
Y
Yu Yang 已提交
634 635 636 637 638
        size=[pred_dict_len, word_dim],
        dtype='float32',
        param_attr='vemb')

    mark_embedding = fluid.layers.embedding(
C
chengduoZH 已提交
639 640 641 642
        input=mark,
        is_sparse=is_sparse,
        size=[mark_dict_len, mark_dim],
        dtype='float32')
Y
Yu Yang 已提交
643 644 645 646 647

    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        fluid.layers.embedding(
            size=[word_dict_len, word_dim],
C
chengduoZH 已提交
648
            is_sparse=is_sparse,
Y
Yu Yang 已提交
649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697
            input=x,
            param_attr=fluid.ParamAttr(
                name=embedding_name, trainable=False)) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    hidden_0_layers = [
        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
        for emb in emb_layers
    ]

    hidden_0 = fluid.layers.sums(input=hidden_0_layers)

    lstm_0 = fluid.layers.dynamic_lstm(
        input=hidden_0,
        size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')

    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        mix_hidden = fluid.layers.sums(input=[
            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
        ])

        lstm = fluid.layers.dynamic_lstm(
            input=mix_hidden,
            size=hidden_dim,
            candidate_activation='relu',
            gate_activation='sigmoid',
            cell_activation='sigmoid',
            is_reverse=((i % 2) == 1))

        input_tmp = [mix_hidden, lstm]

    feature_out = fluid.layers.sums(input=[
        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
    ])

    return feature_out


class TestCRFModel(unittest.TestCase):
Y
yuyang18 已提交
698
    def check_network_convergence(self, is_sparse, build_strategy=None):
Y
Yu Yang 已提交
699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            word = fluid.layers.data(
                name='word_data', shape=[1], dtype='int64', lod_level=1)
            predicate = fluid.layers.data(
                name='verb_data', shape=[1], dtype='int64', lod_level=1)
            ctx_n2 = fluid.layers.data(
                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
            ctx_n1 = fluid.layers.data(
                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
            ctx_0 = fluid.layers.data(
                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
            ctx_p1 = fluid.layers.data(
                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
            ctx_p2 = fluid.layers.data(
                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
            mark = fluid.layers.data(
                name='mark_data', shape=[1], dtype='int64', lod_level=1)
C
chengduoZH 已提交
718

Y
Yu Yang 已提交
719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745
            feature_out = db_lstm(**locals())
            target = fluid.layers.data(
                name='target', shape=[1], dtype='int64', lod_level=1)
            crf_cost = fluid.layers.linear_chain_crf(
                input=feature_out,
                label=target,
                param_attr=fluid.ParamAttr(
                    name='crfw', learning_rate=1e-1))
            avg_cost = fluid.layers.mean(crf_cost)

            sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=0.01,
                    decay_steps=100000,
                    decay_rate=0.5,
                    staircase=True))
            sgd_optimizer.minimize(avg_cost)

            train_data = paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.conll05.test(), buf_size=8192),
                batch_size=16)

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(startup)

C
chengduoZH 已提交
746 747 748
            pe = fluid.ParallelExecutor(
                use_cuda=True,
                loss_name=avg_cost.name,
Y
yuyang18 已提交
749
                build_strategy=build_strategy)
Y
Yu Yang 已提交
750 751 752 753 754 755 756 757 758 759 760

            feeder = fluid.DataFeeder(
                feed_list=[
                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
                    mark, target
                ],
                place=fluid.CPUPlace())

            data = train_data()
            for i in xrange(10):
                cur_batch = next(data)
C
chengduoZH 已提交
761
                print map(np.array,
J
stash  
JiayiFeng 已提交
762
                          pe.run(feed=feeder.feed(cur_batch),
Y
Yu Yang 已提交
763
                                 fetch_list=[avg_cost.name]))[0]
C
chengduoZH 已提交
764

Y
yuyang18 已提交
765 766 767 768 769
    def test_update_sparse_parameter_all_reduce(self):
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
        self.check_network_convergence(
            is_sparse=True, build_strategy=build_strategy)
C
chengduoZH 已提交
770

Y
yuyang18 已提交
771 772 773 774 775
    def test_update_dense_parameter_all_reduce(self):
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
        self.check_network_convergence(
            is_sparse=False, build_strategy=build_strategy)
C
chengduoZH 已提交
776

Y
yuyang18 已提交
777 778 779
    def test_update_sparse_parameter_reduce(self):
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
C
chengduoZH 已提交
780
        self.check_network_convergence(
Y
yuyang18 已提交
781
            is_sparse=False, build_strategy=build_strategy)
C
chengduoZH 已提交
782

Y
yuyang18 已提交
783 784 785
    def test_update_dense_parameter_reduce(self):
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
C
chengduoZH 已提交
786
        self.check_network_convergence(
Y
yuyang18 已提交
787
            is_sparse=False, build_strategy=build_strategy)
C
chengduoZH 已提交
788

C
chengduoZH 已提交
789

C
chengduoZH 已提交
790
# test fetch all the variables of global_block
C
chengduoZH 已提交
791 792

import paddle.dataset.flowers as flowers
C
chengduoZH 已提交
793
import math
C
chengduoZH 已提交
794 795


C
chengduoZH 已提交
796
def Lenet(data, class_dim):
C
chengduoZH 已提交
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818
    conv1 = fluid.layers.conv2d(data, 32, 5, 1, act=None)
    bn1 = fluid.layers.batch_norm(conv1, act='relu')
    pool1 = fluid.layers.pool2d(bn1, 2, 'max', 2)
    conv2 = fluid.layers.conv2d(pool1, 50, 5, 1, act=None)
    bn2 = fluid.layers.batch_norm(conv2, act='relu')
    pool2 = fluid.layers.pool2d(bn2, 2, 'max', 2)

    fc1 = fluid.layers.fc(pool2, size=500, act='relu')
    fc2 = fluid.layers.fc(fc1, size=class_dim, act='softmax')

    return fc2


class TestFetchOp(unittest.TestCase):
    def parallel_exe(self, train_inputs, seed):
        main = fluid.Program()
        startup = fluid.Program()
        startup.random_seed = seed
        with fluid.program_guard(main, startup):
            data = fluid.layers.data(
                name='image', shape=[3, 224, 224], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
819
            out = Lenet(data, class_dim=102)
C
chengduoZH 已提交
820 821 822 823 824 825 826 827 828 829 830
            loss = fluid.layers.cross_entropy(input=out, label=label)
            loss = fluid.layers.mean(loss)

            opt = fluid.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))

            opt.minimize(loss)

            # TODO(zcd): I found that onece the memory optimizer is open,
C
chengduoZH 已提交
831 832
            # parallel_exe doesn't fetch some variable, such as conv2d_0.b_0@GRAD,
            # conv2d_1.b_0@GRAD. Those variables should not be pruned.
C
chengduoZH 已提交
833 834 835 836 837 838 839 840 841 842 843
            # fluid.memory_optimize(main)

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(startup)

            feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
            pe = fluid.ParallelExecutor(
                use_cuda=True, loss_name=loss.name, main_program=main)

            fetch_list = []
C
chengduoZH 已提交
844 845
            all_vars = main.global_block().vars
            for k, v in all_vars.iteritems():
C
chengduoZH 已提交
846
                if 'tmp' not in k and k[0] is not '_' or v.persistable:
C
chengduoZH 已提交
847
                    fetch_list.append(k)
C
chengduoZH 已提交
848

C
chengduoZH 已提交
849
            for data in train_inputs:
C
chengduoZH 已提交
850 851
                ret = pe.run(fetch_list, feed=feeder.feed(data))
                for i in range(len(fetch_list)):
C
chengduoZH 已提交
852 853
                    assert not math.isnan(np.sum(ret[i])) and \
                           not math.isinf(np.sum(ret[i]))
C
chengduoZH 已提交
854

Y
yuyang18 已提交
855 856
    @unittest.skip("this test is buggy")
    def test_feed(self):
C
chengduoZH 已提交
857 858 859
        tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
        tst_reader_iter = tst_reader()

C
chengduoZH 已提交
860
        iters = 3
C
chengduoZH 已提交
861 862 863 864
        train_inputs = []
        for i in range(iters):
            train_inputs.append(tst_reader_iter.next())

C
chengduoZH 已提交
865
        self.parallel_exe(train_inputs, seed=1)
C
chengduoZH 已提交
866 867


Y
yuyang18 已提交
868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904
class TestFeedParallel(unittest.TestCase):
    def test_main(self):
        main = fluid.Program()
        startup = fluid.Program()
        startup.random_seed = 1
        with fluid.scope_guard(fluid.core.Scope()):
            with fluid.program_guard(main, startup):
                data = fluid.layers.data(
                    name='image', shape=[3, 224, 224], dtype='float32')
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
                out = Lenet(data, class_dim=102)
                loss = fluid.layers.cross_entropy(input=out, label=label)
                loss = fluid.layers.mean(loss)
                opt = fluid.optimizer.Momentum(
                    learning_rate=0.1,
                    momentum=0.9,
                    regularization=fluid.regularizer.L2Decay(1e-4))

                opt.minimize(loss)
        place = fluid.CUDAPlace(0)
        feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
        reader = feeder.decorate_reader(
            paddle.batch(
                flowers.train(), batch_size=16), multi_devices=True)
        exe = fluid.Executor(place)
        exe.run(startup)
        pe = fluid.ParallelExecutor(
            use_cuda=True, loss_name=loss.name, main_program=main)

        for batch_id, data in enumerate(reader()):
            loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0])
            print batch_id, loss_np
            if batch_id == 2:
                break


C
chengduoZH 已提交
905 906
if __name__ == '__main__':
    unittest.main()