test_parallel_executor.py 23.9 KB
Newer Older
Y
Yu Yang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Y
Yu Yang 已提交
15
import numpy
Y
Yu Yang 已提交
16
import unittest
Y
Yu Yang 已提交
17

Y
Yu Yang 已提交
18
import paddle.fluid as fluid
19 20 21
import paddle
import paddle.dataset.mnist as mnist
import paddle.dataset.wmt16 as wmt16
Y
Yu Yang 已提交
22 23


X
Xin Pan 已提交
24 25 26 27 28
def simple_fc_net(use_feed):
    if use_feed:
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
J
JiayiFeng 已提交
29 30
        reader = fluid.layers.open_files(
            filenames=['./mnist.recordio'],
X
Xin Pan 已提交
31 32
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
J
JiayiFeng 已提交
33 34 35 36
            dtypes=['float32', 'int64'],
            thread_num=1,
            for_parallel=True)
        reader = fluid.layers.io.double_buffer(reader)
X
Xin Pan 已提交
37
        img, label = fluid.layers.read_file(reader)
38 39 40 41 42 43 44 45 46 47 48 49 50 51
    hidden = img
    for _ in xrange(4):
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            act='tanh',
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))
    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


X
Xin Pan 已提交
52 53 54 55 56
def fc_with_batchnorm(use_feed):
    if use_feed:
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    else:
J
JiayiFeng 已提交
57 58
        reader = fluid.layers.open_files(
            filenames=['mnist.recordio'],
X
Xin Pan 已提交
59 60
            shapes=[[-1, 784], [-1, 1]],
            lod_levels=[0, 0],
J
JiayiFeng 已提交
61 62 63 64
            dtypes=['float32', 'int64'],
            thread_num=1,
            for_parallel=True)
        reader = fluid.layers.io.double_buffer(reader)
X
Xin Pan 已提交
65 66
        img, label = fluid.layers.read_file(reader)

67
    hidden = img
Y
Yu Yang 已提交
68
    for _ in xrange(1):
69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            act='tanh',
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))

        hidden = fluid.layers.batch_norm(input=hidden)

    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


Y
Yu Yang 已提交
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
def squeeze_excitation(input, num_channels, reduction_ratio):
    # pool = fluid.layers.pool2d(
    #    input=input, pool_size=0, pool_type='avg', global_pooling=True)
    conv = input
    shape = conv.shape
    reshape = fluid.layers.reshape(
        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
    pool = fluid.layers.reduce_mean(input=reshape, dim=2)

    squeeze = fluid.layers.fc(input=pool,
                              size=num_channels / reduction_ratio,
                              act='relu')
    excitation = fluid.layers.fc(input=squeeze,
                                 size=num_channels,
                                 act='sigmoid')
    scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0)
    return scale


def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
                  act=None):
    conv = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=(filter_size - 1) / 2,
        groups=groups,
        act=None,
        bias_attr=False)
    return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1)


def shortcut(input, ch_out, stride):
    ch_in = input.shape[1]
    if ch_in != ch_out:
        if stride == 1:
            filter_size = 1
        else:
            filter_size = 3
        return conv_bn_layer(input, ch_out, filter_size, stride)
    else:
        return input


def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
    # The number of first 1x1 convolutional channels for each bottleneck build block
    # was halved to reduce the compution cost.
    conv0 = conv_bn_layer(
        input=input, num_filters=num_filters, filter_size=1, act='relu')
    conv1 = conv_bn_layer(
        input=conv0,
        num_filters=num_filters * 2,
        filter_size=3,
        stride=stride,
        groups=cardinality,
        act='relu')
    conv2 = conv_bn_layer(
        input=conv1, num_filters=num_filters * 2, filter_size=1, act=None)
    scale = squeeze_excitation(
        input=conv2,
        num_channels=num_filters * 2,
        reduction_ratio=reduction_ratio)

    short = shortcut(input, num_filters * 2, stride)

    return fluid.layers.elementwise_add(x=short, y=scale, act='relu')


X
Xin Pan 已提交
153
def SE_ResNeXt50Small(batch_size=2, use_feed=False):
X
Xin Pan 已提交
154 155
    assert not use_feed, "SE_ResNeXt doesn't support feed yet"

Y
Yu Yang 已提交
156 157 158 159
    img = fluid.layers.fill_constant(
        shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0)
    label = fluid.layers.fill_constant(
        shape=[batch_size, 1], dtype='int64', value=0.0)
Y
Yu Yang 已提交
160 161

    conv = conv_bn_layer(
162
        input=img, num_filters=16, filter_size=3, stride=2, act='relu')
Y
Yu Yang 已提交
163
    conv = conv_bn_layer(
164
        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
Y
Yu Yang 已提交
165
    conv = conv_bn_layer(
166
        input=conv, num_filters=16, filter_size=3, stride=1, act='relu')
Y
Yu Yang 已提交
167 168 169
    conv = fluid.layers.pool2d(
        input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')

X
Xin Pan 已提交
170
    cardinality = 32
Y
Yu Yang 已提交
171
    reduction_ratio = 16
X
Xin Pan 已提交
172
    depth = [3, 4, 6, 3]
Y
Yu Yang 已提交
173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195
    num_filters = [128, 256, 512, 1024]

    for block in range(len(depth)):
        for i in range(depth[block]):
            conv = bottleneck_block(
                input=conv,
                num_filters=num_filters[block],
                stride=2 if i == 0 and block != 0 else 1,
                cardinality=cardinality,
                reduction_ratio=reduction_ratio)

    shape = conv.shape
    reshape = fluid.layers.reshape(
        x=conv, shape=[-1, shape[1], shape[2] * shape[3]])
    pool = fluid.layers.reduce_mean(input=reshape, dim=2)
    dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2)
    # Classifier layer:
    prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss


Y
Yu Yang 已提交
196 197 198
import time


Y
Yu Yang 已提交
199
class TestParallelExecutorBase(unittest.TestCase):
Y
Yu Yang 已提交
200 201 202 203
    def check_network_convergence(self,
                                  method,
                                  memory_opt=True,
                                  iter=10,
X
Xin Pan 已提交
204
                                  batch_size=None,
X
Xin Pan 已提交
205 206
                                  allow_op_delay=False,
                                  feed_dict={}):
Y
Yu Yang 已提交
207 208
        main = fluid.Program()
        startup = fluid.Program()
Y
Yu Yang 已提交
209
        startup.random_seed = 1  # Fix random seed
Y
Yu Yang 已提交
210
        with fluid.program_guard(main, startup):
X
Xin Pan 已提交
211
            loss = method(use_feed=len(feed_dict) > 0)
Y
Yu Yang 已提交
212 213 214 215
            adam = fluid.optimizer.Adam()
            adam.minimize(loss)
            if memory_opt:
                fluid.memory_optimize(main)
216 217 218 219
            place = fluid.CUDAPlace(0)
            startup_exe = fluid.Executor(place)
            startup_exe.run(startup)

Y
Yu Yang 已提交
220 221
            exe = fluid.ParallelExecutor(
                True, loss_name=loss.name, allow_op_delay=allow_op_delay)
Y
Yu Yang 已提交
222 223 224
            if batch_size is not None:
                batch_size *= fluid.core.get_cuda_device_count()
            begin = time.time()
X
Xin Pan 已提交
225
            first_loss, = exe.run([loss.name], feed_dict=feed_dict)
Y
Yu Yang 已提交
226 227 228
            first_loss = numpy.array(first_loss)

            for i in xrange(iter):
X
Xin Pan 已提交
229
                exe.run([], feed_dict=feed_dict)
Y
Yu Yang 已提交
230

X
Xin Pan 已提交
231
            last_loss, = exe.run([loss.name], feed_dict=feed_dict)
Y
Yu Yang 已提交
232 233 234 235 236 237
            end = time.time()

            if batch_size is not None:
                print "%.4f Instance per second" % (
                    (batch_size * iter + 2) / (end - begin))

Y
Yu Yang 已提交
238 239 240
            last_loss = numpy.array(last_loss)

            print first_loss, last_loss
Y
Yu Yang 已提交
241
            # self.assertGreater(first_loss[0], last_loss[0])
Y
Yu Yang 已提交
242 243 244


class TestMNIST(TestParallelExecutorBase):
245 246
    @classmethod
    def setUpClass(cls):
Y
Stash  
Yu Yang 已提交
247 248
        # Convert mnist to recordio file
        with fluid.program_guard(fluid.Program(), fluid.Program()):
249
            reader = paddle.batch(mnist.train(), batch_size=4)
Y
Stash  
Yu Yang 已提交
250 251 252 253 254 255 256 257 258 259 260
            feeder = fluid.DataFeeder(
                feed_list=[  # order is image and label
                    fluid.layers.data(
                        name='image', shape=[784]),
                    fluid.layers.data(
                        name='label', shape=[1], dtype='int64'),
                ],
                place=fluid.CPUPlace())
            fluid.recordio_writer.convert_reader_to_recordio_file(
                './mnist.recordio', reader, feeder)

Y
Yu Yang 已提交
261 262
    def test_simple_fc(self):
        self.check_network_convergence(simple_fc_net)
X
Xin Pan 已提交
263
        self.check_network_convergence(simple_fc_net, allow_op_delay=True)
Y
Yu Yang 已提交
264

X
Xin Pan 已提交
265 266 267 268 269 270
        img = numpy.zeros(shape=[32, 784], dtype='float32')
        label = numpy.ones(shape=[32, 1], dtype='int64')
        self.check_network_convergence(
            simple_fc_net, feed_dict={"image": img,
                                      "label": label})

Y
Yu Yang 已提交
271 272
    def test_batchnorm_fc(self):
        self.check_network_convergence(fc_with_batchnorm)
X
Xin Pan 已提交
273 274 275 276 277
        img = numpy.zeros(shape=[32, 784], dtype='float32')
        label = numpy.ones(shape=[32, 1], dtype='int64')
        self.check_network_convergence(
            fc_with_batchnorm, feed_dict={"image": img,
                                          "label": label})
Y
Yu Yang 已提交
278 279 280


class TestResnet(TestParallelExecutorBase):
Y
Yu Yang 已提交
281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297
    # @classmethod
    # def setUpClass(cls):
    #     # import os
    #     # if os.path.exists('./flowers.recordio'):
    #     #     return
    #     with fluid.program_guard(fluid.Program(), fluid.Program()):
    #         reader = paddle.batch(flowers.train(), batch_size=4)
    #         feeder = fluid.DataFeeder(
    #             feed_list=[
    #                 fluid.layers.data(
    #                     name='image', shape=[3, 224, 224]),
    #                 fluid.layers.data(
    #                     name='label', shape=[1], dtype='int64'),
    #             ],
    #             place=fluid.CPUPlace())
    #         fluid.recordio_writer.convert_reader_to_recordio_file(
    #             "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress)
Y
Yu Yang 已提交
298 299

    def test_resnet(self):
Y
Yu Yang 已提交
300
        import functools
301
        batch_size = 2
Y
Yu Yang 已提交
302 303
        self.check_network_convergence(
            functools.partial(
X
Xin Pan 已提交
304
                SE_ResNeXt50Small, batch_size=batch_size),
Y
Yu Yang 已提交
305 306
            iter=20,
            batch_size=batch_size)
Y
Yu Yang 已提交
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432


class ModelHyperParams(object):
    # Dictionary size for source and target language. This model directly uses
    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
    # alreay been added, but the <pad> token is not added. Transformer requires
    # sequences in a mini-batch are padded to have the same length. A <pad> token is
    # added into the original dictionary in paddle.dateset.wmt16.

    # size of source word dictionary.
    src_vocab_size = 10000
    # index for <pad> token in source language.
    src_pad_idx = src_vocab_size

    # size of target word dictionay
    trg_vocab_size = 10000
    # index for <pad> token in target language.
    trg_pad_idx = trg_vocab_size

    # position value corresponding to the <pad> token.
    pos_pad_idx = 0

    # max length of sequences. It should plus 1 to include position
    # padding token for position encoding.
    max_length = 50

    # the dimension for word embeddings, which is also the last dimension of
    # the input and output of multi-head attention, position-wise feed-forward
    # networks, encoder and decoder.

    d_model = 512
    # size of the hidden layer in position-wise feed-forward networks.
    d_inner_hid = 1024
    # the dimension that keys are projected to for dot-product attention.
    d_key = 64
    # the dimension that values are projected to for dot-product attention.
    d_value = 64
    # number of head used in multi-head attention.
    n_head = 8
    # number of sub-layers to be stacked in the encoder and decoder.
    n_layer = 6
    # dropout rate used by all dropout layers.
    dropout = 0.1


import numpy as np


def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and attention bias. Then, convert the numpy
    data to tensors and return a dict mapping names to tensors.
    """

    def __pad_batch_data(insts,
                         pad_idx,
                         is_target=False,
                         return_pos=True,
                         return_attn_bias=True,
                         return_max_len=True):
        """
        Pad the instances to the max sequence length in batch, and generate the
        corresponding position data and attention bias.
        """
        return_list = []
        max_len = max(len(inst) for inst in insts)
        inst_data = np.array(
            [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
        return_list += [inst_data.astype("int64").reshape([-1, 1])]
        if return_pos:
            inst_pos = np.array([[
                pos_i + 1 if w_i != pad_idx else 0
                for pos_i, w_i in enumerate(inst)
            ] for inst in inst_data])

            return_list += [inst_pos.astype("int64").reshape([-1, 1])]
        if return_attn_bias:
            if is_target:
                # This is used to avoid attention on paddings and subsequent
                # words.
                slf_attn_bias_data = np.ones((inst_data.shape[0], max_len,
                                              max_len))
                slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape(
                    [-1, 1, max_len, max_len])
                slf_attn_bias_data = np.tile(slf_attn_bias_data,
                                             [1, n_head, 1, 1]) * [-1e9]
            else:
                # This is used to avoid attention on paddings.
                slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] *
                                               (max_len - len(inst))
                                               for inst in insts])
                slf_attn_bias_data = np.tile(
                    slf_attn_bias_data.reshape([-1, 1, 1, max_len]),
                    [1, n_head, max_len, 1])
            return_list += [slf_attn_bias_data.astype("float32")]
        if return_max_len:
            return_list += [max_len]
        return return_list if len(return_list) > 1 else return_list[0]

    def data_to_tensor(data_list, name_list, input_dict, place):
        assert len(data_list) == len(name_list)
        for i in range(len(name_list)):
            tensor = fluid.LoDTensor()
            tensor.set(data_list[i], place)
            input_dict[name_list[i]] = tensor

    src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data(
        [inst[0] for inst in insts], src_pad_idx, is_target=False)
    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data(
        [inst[1] for inst in insts], trg_pad_idx, is_target=True)
    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
                                [1, 1, trg_max_len, 1]).astype("float32")
    lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False,
                                False, False, False)
    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])

    return [
        src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias,
        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
    ]


import transformer_model


X
Xin Pan 已提交
433 434
def transformer(use_feed):
    assert not use_feed, "transfomer doesn't support feed yet"
Y
Yu Yang 已提交
435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463
    return transformer_model.transformer(
        ModelHyperParams.src_vocab_size + 1,
        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer, ModelHyperParams.n_head,
        ModelHyperParams.d_key, ModelHyperParams.d_value,
        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)


class TestTransformer(TestParallelExecutorBase):
    @classmethod
    def setUpClass(cls):
        reader = paddle.batch(
            wmt16.train(ModelHyperParams.src_vocab_size,
                        ModelHyperParams.trg_vocab_size),
            batch_size=transformer_model.batch_size)

        with fluid.recordio_writer.create_recordio_writer(
                "./wmt16.recordio") as writer:
            for batch in reader():
                for tensor in prepare_batch_input(
                        batch, ModelHyperParams.src_pad_idx,
                        ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head):
                    t = fluid.LoDTensor()
                    t.set(tensor, fluid.CPUPlace())
                    writer.append_tensor(t)
                writer.complete_append_tensor()

Y
Yu Yang 已提交
464
    @unittest.skip("transformer is buggy in multi gpu")
Y
Yu Yang 已提交
465 466
    def test_main(self):
        self.check_network_convergence(transformer)
467 468 469 470 471 472 473 474 475 476


class ParallelExecutorTestingDuringTraining(unittest.TestCase):
    def test_parallel_testing(self):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            loss = simple_fc_net(True)
            test_program = main.clone(for_test=True)

D
Dang Qingqing 已提交
477
            opt = fluid.optimizer.SGD(learning_rate=0.001)
478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503
            opt.minimize(loss)

            batch_size = 32
            image = numpy.random.normal(size=(batch_size,
                                              784)).astype('float32')
            label = numpy.random.randint(0, 10, (batch_size, 1), dtype="int64")

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(startup)
            feed_dict = {'image': image, 'label': label}

            train_exe = fluid.ParallelExecutor(
                use_cuda=True, loss_name=loss.name, main_program=main)

            test_exe = fluid.ParallelExecutor(
                use_cuda=True,
                main_program=test_program,
                share_vars_from=train_exe)

            for i in xrange(5):
                test_loss, = test_exe.run([loss.name], feed_dict=feed_dict)
                test_loss = numpy.array(test_loss)

                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
                train_loss = numpy.array(train_loss)
D
Dang Qingqing 已提交
504 505 506 507 508
                self.assertTrue(
                    numpy.allclose(
                        train_loss, test_loss, atol=1e-8),
                    "Train loss: " + str(train_loss) + "\n Test loss:" +
                    str(test_loss))
Y
Yu Yang 已提交
509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653


import paddle.dataset.conll05 as conll05
import paddle.fluid as fluid

word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_dict_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
mix_hidden_lr = 1e-3
embedding_name = 'emb'


def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
            **ignored):
    # 8 features
    predicate_embedding = fluid.layers.embedding(
        input=predicate,
        size=[pred_dict_len, word_dim],
        dtype='float32',
        param_attr='vemb')

    mark_embedding = fluid.layers.embedding(
        input=mark, size=[mark_dict_len, mark_dim], dtype='float32')

    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
    emb_layers = [
        fluid.layers.embedding(
            size=[word_dict_len, word_dim],
            input=x,
            param_attr=fluid.ParamAttr(
                name=embedding_name, trainable=False)) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)

    hidden_0_layers = [
        fluid.layers.fc(input=emb, size=hidden_dim, act='tanh')
        for emb in emb_layers
    ]

    hidden_0 = fluid.layers.sums(input=hidden_0_layers)

    lstm_0 = fluid.layers.dynamic_lstm(
        input=hidden_0,
        size=hidden_dim,
        candidate_activation='relu',
        gate_activation='sigmoid',
        cell_activation='sigmoid')

    # stack L-LSTM and R-LSTM with direct edges
    input_tmp = [hidden_0, lstm_0]

    for i in range(1, depth):
        mix_hidden = fluid.layers.sums(input=[
            fluid.layers.fc(input=input_tmp[0], size=hidden_dim, act='tanh'),
            fluid.layers.fc(input=input_tmp[1], size=hidden_dim, act='tanh')
        ])

        lstm = fluid.layers.dynamic_lstm(
            input=mix_hidden,
            size=hidden_dim,
            candidate_activation='relu',
            gate_activation='sigmoid',
            cell_activation='sigmoid',
            is_reverse=((i % 2) == 1))

        input_tmp = [mix_hidden, lstm]

    feature_out = fluid.layers.sums(input=[
        fluid.layers.fc(input=input_tmp[0], size=label_dict_len, act='tanh'),
        fluid.layers.fc(input=input_tmp[1], size=label_dict_len, act='tanh')
    ])

    return feature_out


class TestCRFModel(unittest.TestCase):
    def test_all(self):
        main = fluid.Program()
        startup = fluid.Program()
        with fluid.program_guard(main, startup):
            word = fluid.layers.data(
                name='word_data', shape=[1], dtype='int64', lod_level=1)
            predicate = fluid.layers.data(
                name='verb_data', shape=[1], dtype='int64', lod_level=1)
            ctx_n2 = fluid.layers.data(
                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
            ctx_n1 = fluid.layers.data(
                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
            ctx_0 = fluid.layers.data(
                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
            ctx_p1 = fluid.layers.data(
                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
            ctx_p2 = fluid.layers.data(
                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
            mark = fluid.layers.data(
                name='mark_data', shape=[1], dtype='int64', lod_level=1)
            feature_out = db_lstm(**locals())
            target = fluid.layers.data(
                name='target', shape=[1], dtype='int64', lod_level=1)
            crf_cost = fluid.layers.linear_chain_crf(
                input=feature_out,
                label=target,
                param_attr=fluid.ParamAttr(
                    name='crfw', learning_rate=1e-1))
            avg_cost = fluid.layers.mean(crf_cost)

            sgd_optimizer = fluid.optimizer.SGD(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=0.01,
                    decay_steps=100000,
                    decay_rate=0.5,
                    staircase=True))
            sgd_optimizer.minimize(avg_cost)

            train_data = paddle.batch(
                paddle.reader.shuffle(
                    paddle.dataset.conll05.test(), buf_size=8192),
                batch_size=16)

            place = fluid.CUDAPlace(0)
            exe = fluid.Executor(place)
            exe.run(startup)

            pe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)

            feeder = fluid.DataFeeder(
                feed_list=[
                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
                    mark, target
                ],
                place=fluid.CPUPlace())

            data = train_data()
            for i in xrange(10):
                cur_batch = next(data)
                print map(numpy.array,
                          pe.run(feed_dict=feeder.feed(cur_batch),
                                 fetch_list=[avg_cost.name]))[0]