Merge branch 'develop' of https://github.com/PaddlePaddle/models into new_method

f258a876 · buaawht · 1af463bf · 26b3788b · f258a876 · f258a876
31 changed files
--- a/.gitignore
+++ b/.gitignore
 .DS_Store
 *.pyc
+.*~
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,7 +17,7 @@ addons:
      - python-pip
      - python2.7-dev
      - clang-format-3.8
-  ssh_known_hosts: 52.76.173.135
+  ssh_known_hosts: 13.229.163.131
 before_install:
  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  - sudo pip install -U virtualenv pre-commit pip

--- a/fluid/DeepASR/tools/profile.py
+++ b/fluid/DeepASR/tools/profile.py
@@ -168,7 +168,7 @@ def profile(args):
                start_time = time.time()
                frames_seen = 0
            # load_data
-            (features, labels, lod) = batch_data
+            (features, labels, lod, _) = batch_data
            feature_t.set(features, place)
            feature_t.set_lod([lod])
            label_t.set(labels, place)

--- a/fluid/DeepASR/train.py
+++ b/fluid/DeepASR/train.py
@@ -192,7 +192,7 @@ def train(args):
                test_data_reader.batch_iterator(args.batch_size,
                                                args.minimum_batch_size)):
            # load_data
-            (features, labels, lod) = batch_data
+            (features, labels, lod, _) = batch_data
            feature_t.set(features, place)
            feature_t.set_lod([lod])
            label_t.set(labels, place)

--- a/fluid/image_classification/caffe2fluid/README.md
+++ b/fluid/image_classification/caffe2fluid/README.md
@@ -18,19 +18,19 @@ This tool is used to convert a Caffe model to Fluid model


 ### Tested models
- Lenet on mnist dataset
+- Lenet

 - ResNets:(ResNet-50, ResNet-101, ResNet-152)
-    model addr: `https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777`_
+[model addr](https://onedrive.live.com/?authkey=%21AAFW2-FVoxeVRck&id=4006CBB8476FF777%2117887&cid=4006CBB8476FF777)

 - GoogleNet:
-    model addr: `https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034`_
+[model addr](https://gist.github.com/jimmie33/7ea9f8ac0da259866b854460f4526034)

 - VGG:
-    model addr: `https://gist.github.com/ksimonyan/211839e770f7b538e2d8`_
+[model addr](https://gist.github.com/ksimonyan/211839e770f7b538e2d8)

 - AlexNet:
-    model addr: `https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet`_
+[model addr](https://github.com/BVLC/caffe/tree/master/models/bvlc_alexnet)

 ### Notes
 Some of this code come from here: https://github.com/ethereon/caffe-tensorflow
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/compare.py
+#!/usr/bin/python
+
+#
+#a tool to compare tensors in two files or two directories
+#
+
+import sys
+import os
+
+
+def walk_dir(rootdir):
+    """ yield the base name of every file found under 'rootdir' (recursively)
+
+    note: only the file name is yielded, not the sub-directory it lives in
+    """
+    for _, _, filenames in os.walk(rootdir):
+        for name in filenames:
+            yield name
+
+
+def calc_diff(f1, f2):
+    """ load two '.npy' tensors and measure how far apart they are
+
+    args:
+        f1, f2: paths of the two '.npy' files to compare
+    returns:
+        (max_df, sq_df): the largest absolute element-wise difference and
+        the mean of the squared differences
+    raises:
+        AssertionError if the two tensors hold a different number of elements
+    """
+    import numpy as np
+
+    d1 = np.load(f1).flatten()
+    d2 = np.load(f2).flatten()
+
+    #both tensors are flattened, so comparing element counts is enough
+    if d1.size != d2.size:
+        print(d1.shape)
+        print(d2.shape)
+        raise AssertionError("their shape is not consistent")
+
+    #let any numeric error propagate: returning a (-1.0, -1.0) sentinel here
+    #would satisfy the 'max_df < 1e-5' style checks of the caller and make
+    #a failed comparison look like a perfect match
+    df = np.abs(d1 - d2)
+    max_df = np.max(df)
+    sq_df = np.mean(df * df)
+    return max_df, sq_df
+
+
+def compare(path1, path2):
+    """ compare two '.npy' files, or every '.npy' file of two directories
+
+    args:
+        path1, path2: either two '.npy' files, or two directories; for
+            directories, each '.npy' file name found under 'path2' is
+            looked up directly inside both 'path1' and 'path2'
+    returns:
+        0 when every comparison passed, 1 when one of the paths is missing
+    raises:
+        AssertionError when the difference of a pair exceeds the thresholds
+    """
+
+    def diff(f1, f2):
+        max_df, sq_df = calc_diff(f1, f2)
+        print('compare %s <=> %s with result[max_df:%.4e, sq_df:%.4e]' %
+              (f1, f2, max_df, sq_df))
+        #raise explicitly so that the checks survive 'python -O';
+        #'not (a < b)' also rejects NaN, exactly like the former asserts
+        if not (max_df < 1e-5):
+            raise AssertionError(
+                'max_df is too large with value[%.6e]' % (max_df))
+        if not (sq_df < 1e-10):
+            raise AssertionError(
+                'sq_df is too large with value[%.6e]' % (sq_df))
+
+    for path in (path1, path2):
+        if not os.path.exists(path):
+            print('not found %s' % (path))
+            return 1
+
+    if path1.find('.npy') > 0 and path2.find('.npy') > 0:
+        diff(path1, path2)
+        #report success explicitly instead of falling through with 'None'
+        return 0
+
+    for f in walk_dir(path2):
+        if f.find('.npy') < 0:
+            continue
+
+        diff(os.path.join(path1, f), os.path.join(path2, f))
+
+    print('all checking succeed to pass')
+    return 0
+
+
+if __name__ == "__main__":
+    #without arguments the default result directories are compared,
+    #otherwise exactly two paths (files or directories) are expected
+    if len(sys.argv) == 1:
+        path1 = 'lenet.tf/results'
+        path2 = 'lenet.paddle/results'
+    elif len(sys.argv) == 3:
+        path1 = sys.argv[1]
+        path2 = sys.argv[2]
+    else:
+        print('usage:')
+        print(' %s [path1] [path2]' % (sys.argv[0]))
+        #use sys.exit: the bare 'exit' helper is only injected by 'site'
+        sys.exit(1)
+
+    print('compare inner result in %s %s' % (path1, path2))
+    sys.exit(compare(path1, path2))
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/diff.sh
+#!/bin/bash
+
+#
+#function:
+#   a tool used to check the difference between the results generated by a caffe model and by the converted paddle model
+#
+#howto:
+#   bash diff.sh resnet50 #when this has finished, the difference in precision is printed
+#
+#notes:
+#   0, in order to infer using caffe, we need pycaffe installed
+#   1, prepare your caffe model in 'models.caffe/', eg: 'models.caffe/resnet101/resnet101.[prototxt|caffemodel]'
+#   2, the converted paddle model will be in 'models'
+#   3, results of layers will be stored in 'results/${model_name}.[paddle|caffe]'
+#   4, only the last layer will be checked by default
+
+model_name="resnet50"
+results_root="results/"
+
+if [[ -n $1 ]];then
+    if [ "$1" = "-h" ];then
+        echo "usage:"
+        echo "  bash $0 [model_name]"
+        echo "  eg:bash $0 resnet50"
+        exit 0
+    fi
+    model_name=$1
+fi
+
+mkdir -p "$results_root"
+
+model_prototxt="models.caffe/$model_name/${model_name}.prototxt"
+model_caffemodel="models.caffe/${model_name}/${model_name}.caffemodel"
+
+#1, dump layers' results from paddle
+#   (paths are quoted everywhere so that an unusual model name can never
+#    turn 'rm -rf' or 'mv' into a command with unexpected arguments)
+paddle_results="$results_root/${model_name}.paddle"
+rm -rf "$paddle_results"
+rm -rf "results.paddle"
+bash run.sh "$model_name" "./models.caffe/$model_name" "./models/$model_name"
+if [[ $? -ne 0 ]] || [[ ! -e "results.paddle" ]];then
+    echo "not found paddle's results, maybe failed to convert"
+    exit 1
+fi
+mv results.paddle "$paddle_results"
+
+#2, dump layers' results from caffe
+#   NOTE(review): 'cfpython' is presumably a python interpreter with pycaffe available -- confirm
+caffe_results="$results_root/${model_name}.caffe"
+rm -rf "$caffe_results"
+rm -rf "results.caffe"
+cfpython ./infer.py caffe "$model_prototxt" "$model_caffemodel" "$paddle_results/data.npy"
+if [[ $? -ne 0 ]] || [[ ! -e "results.caffe" ]];then
+    echo "not found caffe's results, maybe failed to do inference with caffe"
+    exit 1
+fi
+mv results.caffe "$caffe_results"
+
+#3, extract layer names
+grep name "$model_prototxt" | perl -ne 'if(/^\s*name:\s+\"([^\"]+)/){ print $1."\n";}' >.layer_names
+
+#4, compare one by one (only the last extracted layer name is used)
+for i in $(tail -n1 ".layer_names");do
+    echo "process $i"
+    python compare.py "$caffe_results/${i}.npy" "$paddle_results/${i}.npy"
+done
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/infer.py
@@ -10,8 +10,11 @@ import os
 import sys
 import inspect
 import numpy as np
-import paddle.v2 as paddle
-import paddle.v2.fluid as fluid
+
+
+def import_fluid():
+    import paddle.fluid as fluid  # imported on demand, not at module load
+    return fluid


 def load_data(imgfile, shape):
@@ -52,8 +55,10 @@ def build_model(net_file, net_name):
        print(e)
        return None

-    input_name = 'data'
-    input_shape = MyNet.input_shapes()[input_name]
+    fluid = import_fluid()
+    inputs_dict = MyNet.input_shapes()
+    input_name = inputs_dict.keys()[0]
+    input_shape = inputs_dict[input_name]
    images = fluid.layers.data(name='image', shape=input_shape, dtype='float32')
    #label = fluid.layers.data(name='label', shape=[1], dtype='int64')

@@ -64,7 +69,7 @@ def build_model(net_file, net_name):

 def dump_results(results, names, root):
    if os.path.exists(root) is False:
-        os.path.mkdir(root)
+        os.mkdir(root)

    for i in range(len(names)):
        n = names[i]
@@ -73,9 +78,12 @@ def dump_results(results, names, root):
        np.save(filename + '.npy', res)


-def infer(net_file, net_name, model_file, imgfile, debug=False):
+def infer(net_file, net_name, model_file, imgfile, debug=True):
    """ do inference using a model which consist 'xxx.py' and 'xxx.npy'
    """
+
+    fluid = import_fluid()
+
    #1, build model
    net, input_shape = build_model(net_file, net_name)
    prediction = net.get_output()
@@ -109,34 +117,79 @@ def infer(net_file, net_name, model_file, imgfile, debug=False):
                      fetch_list=fetch_list_var)

    if debug is True:
-        dump_path = 'results.layers'
+        dump_path = 'results.paddle'
        dump_results(results, fetch_list_name, dump_path)
-        print('all results dumped to [%s]' % (dump_path))
+        print('all result of layers dumped to [%s]' % (dump_path))
    else:
        result = results[0]
        print('predicted class:', np.argmax(result))

+    return 0
+
+
+def caffe_infer(prototxt, caffemodel, datafile):
+    """ do inference using pycaffe for debug ('datafile' is a '.npy' or an image),
+        all blobs are dumped to 'results.caffe'; returns 0 when finished
+    """
+    import caffe
+
+    net = caffe.Net(prototxt, caffemodel, caffe.TEST)
+    input_layer = list(net.blobs.keys())[0]
+    print('got name of input layer is:%s' % (input_layer))
+    input_shape = list(net.blobs[input_layer].data.shape[1:])
+
+    if '.npy' in datafile:
+        np_images = np.load(datafile)
+    else:
+        np_images = load_data(datafile, input_shape)
+
+    inputs = {input_layer: np_images}
+    net.forward_all(**inputs)
+
+    results, names = [], []
+    for k, v in net.blobs.items():
+        #drop a trailing '_output'; rstrip() would strip a char set instead
+        k = k[:-len('_output')] if k.endswith('_output') else k
+        k = k.replace('/', '_')
+        names.append(k)
+        results.append(v.data.copy())
+
+    dump_path = 'results.caffe'
+    dump_results(results, names, dump_path)
+    print('all result of layers dumped to [%s]' % (dump_path))
+    return 0
+

 if __name__ == "__main__":
    """ maybe more convenient to use 'run.sh' to call this tool
    """
    net_file = 'models/resnet50/resnet50.py'
    weight_file = 'models/resnet50/resnet50.npy'
-    imgfile = 'data/65.jpeg'
+    datafile = 'data/65.jpeg'
    net_name = 'ResNet50'

    argc = len(sys.argv)
-    if argc == 5:
+    if sys.argv[1] == 'caffe':
+        if len(sys.argv) != 5:
+            print('usage:')
+            print('\tpython %s caffe [prototxt] [caffemodel] [datafile]' %
+                  (sys.argv[0]))
+            sys.exit(1)
+        prototxt = sys.argv[2]
+        caffemodel = sys.argv[3]
+        datafile = sys.argv[4]
+        sys.exit(caffe_infer(prototxt, caffemodel, datafile))
+    elif argc == 5:
        net_file = sys.argv[1]
        weight_file = sys.argv[2]
-        imgfile = sys.argv[3]
+        datafile = sys.argv[3]
        net_name = sys.argv[4]
    elif argc > 1:
        print('usage:')
-        print('\tpython %s [net_file] [weight_file] [imgfile] [net_name]' %
+        print('\tpython %s [net_file] [weight_file] [datafile] [net_name]' %
              (sys.argv[0]))
        print('\teg:python %s %s %s %s %s' % (sys.argv[0], net_file,
-                                              weight_file, imgfile, net_name))
+                                              weight_file, datafile, net_name))
        sys.exit(1)

-    infer(net_file, net_name, weight_file, imgfile)
+    infer(net_file, net_name, weight_file, datafile)
--- a/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
+++ b/fluid/image_classification/caffe2fluid/examples/imagenet/run.sh
@@ -3,7 +3,7 @@
 #function:
 #   a tool used to:
 #       1, convert a caffe model
-#       2, do inference using this model
+#       2, do inference(only in fluid) using this model
 #
 #usage:
 #   bash run.sh resnet50 ./models.caffe/resnet50 ./models/resnet50
@@ -65,7 +65,12 @@ if [[ -z $only_convert ]];then
        PYTHON=`which python`
    fi
    imgfile="data/65.jpeg"
-    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/\"([^\"]+)\"/){ print $1."\n";}'`
+    #FIX ME:
+    #   only the first line containing "name" in the prototxt file is used as the name of this network, which may not be correct
+    net_name=`grep "name" $proto_file | head -n1 | perl -ne 'if(/^\s*name\s*:\s*\"([^\"]+)\"/){ print $1."\n";}'`
+    if [[ -z $net_name ]];then
+        net_name="MyNet"
+    fi
    $PYTHON ./infer.py $net_file $weight_file $imgfile $net_name
    ret=$?
 fi

--- a/fluid/image_classification/caffe2fluid/kaffe/graph.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/graph.py
@@ -52,7 +52,10 @@ class Graph(object):
    def __init__(self, nodes=None, name=None):
        self.nodes = nodes or []
        self.node_lut = {node.name: node for node in self.nodes}
-        self.name = name
+        if name is None or name == '':
+            self.name = 'MyNet'
+        else:
+            self.name = name

    def add_node(self, node):
        self.nodes.append(node)

--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/network.py
@@ -4,7 +4,7 @@ import numpy as np


 def import_fluid():
-    import paddle.v2.fluid as fluid
+    import paddle.fluid as fluid
    return fluid


@@ -64,7 +64,7 @@ class Network(object):
        if os.path.isdir(data_path):
            assert (exe is not None), \
                'must provide a executor to load fluid model'
-            fluid.io.load_persistables_if_exist(executor=exe, dirname=data_path)
+            fluid.io.load_persistables(executor=exe, dirname=data_path)
            return True

        #load model from a npy file
@@ -161,56 +161,28 @@ class Network(object):
        output = fluid.layers.relu(x=input)
        return output

-    def _adjust_pad_if_needed(self, i_hw, k_hw, s_hw, p_hw):
-        #adjust the padding if needed
-        i_h, i_w = i_hw
-        k_h, k_w = k_hw
-        s_h, s_w = s_hw
-        p_h, p_w = p_hw
-
-        def is_consistent(i, k, s, p):
-            o = i + 2 * p - k
-            if o % s == 0:
-                return True
-            else:
-                return False
-
-        real_p_h = 0
-        real_p_w = 0
-        if is_consistent(i_h, k_h, s_h, p_h) is False:
-            real_p_h = int(k_h / 2)
-
-        if is_consistent(i_w, k_w, s_w, p_w) is False:
-            real_p_w = int(k_w / 2)
-
-        return [real_p_h, real_p_w]
-
    def pool(self, pool_type, input, k_h, k_w, s_h, s_w, name, padding):
        # Get the number of channels in the input
        in_hw = input.shape[2:]
        k_hw = [k_h, k_w]
        s_hw = [s_h, s_w]

-        if padding is None:
-            #fix bug about the difference between conv and pool
-            #more info: https://github.com/BVLC/caffe/issues/1318
-            padding = self._adjust_pad_if_needed(in_hw, k_hw, s_hw, [0, 0])
-
        fluid = import_fluid()
        output = fluid.layers.pool2d(
            input=input,
            pool_size=k_hw,
            pool_stride=s_hw,
            pool_padding=padding,
+            ceil_mode=True,
            pool_type=pool_type)
        return output

    @layer
-    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+    def max_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]):
        return self.pool('max', input, k_h, k_w, s_h, s_w, name, padding)

    @layer
-    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=None):
+    def avg_pool(self, input, k_h, k_w, s_h, s_w, name, padding=[0, 0]):
        return self.pool('avg', input, k_h, k_w, s_h, s_w, name, padding)

    @layer
@@ -258,7 +230,12 @@ class Network(object):
        return output

    @layer
-    def batch_normalization(self, input, name, scale_offset=True, relu=False):
+    def batch_normalization(self,
+                            input,
+                            name,
+                            scale_offset=True,
+                            eps=1e-5,
+                            relu=False):
        # NOTE: Currently, only inference is supported
        fluid = import_fluid()
        prefix = name + '_'
@@ -276,7 +253,7 @@ class Network(object):
            bias_attr=bias_attr,
            moving_mean_name=mean_name,
            moving_variance_name=variance_name,
-            epsilon=1e-5,
+            epsilon=eps,
            act='relu' if relu is True else None)

        return output

--- a/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
+++ b/fluid/image_classification/caffe2fluid/kaffe/paddle/transformer.py
@@ -142,7 +142,13 @@ class TensorFlowMapper(NodeMapper):

    def map_batch_norm(self, node):
        scale_offset = len(node.data) == 4
-        kwargs = {} if scale_offset else {'scale_offset': False}
+
+        #this default value comes from caffe's param in batch_norm
+        default_eps = 1e-5
+        kwargs = {'scale_offset': scale_offset}
+        if node.parameters.eps != default_eps:
+            kwargs['eps'] = node.parameters.eps
+
        return MaybeActivated(
            node, default=False)('batch_normalization', **kwargs)

@@ -236,7 +242,7 @@ class TensorFlowEmitter(object):
        func_def = self.statement('@classmethod')
        func_def += self.statement('def convert(cls, npy_model, fluid_path):')
        self.indent()
-        func_def += self.statement('import paddle.v2.fluid as fluid')
+        func_def += self.statement('fluid = import_fluid()')
        for l in codes:
            func_def += self.statement(l)
        return '\n' + func_def

--- a/fluid/image_classification/se_resnext.py
+++ b/fluid/image_classification/se_resnext.py
-import os
-import numpy as np
-import time
-import sys
 import paddle.v2 as paddle
 import paddle.fluid as fluid
-import reader


 def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1,
@@ -124,164 +119,3 @@ def SE_ResNeXt(input, class_dim, infer=False, layers=50):
        drop = pool
    out = fluid.layers.fc(input=drop, size=class_dim, act='softmax')
    return out
-
-
-def train(learning_rate,
-          batch_size,
-          num_passes,
-          init_model=None,
-          model_save_dir='model',
-          parallel=True,
-          use_nccl=True,
-          lr_strategy=None,
-          layers=50):
-    class_dim = 1000
-    image_shape = [3, 224, 224]
-
-    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    if parallel:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
-
-        with pd.do():
-            image_ = pd.read_input(image)
-            label_ = pd.read_input(label)
-            out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
-            cost = fluid.layers.cross_entropy(input=out, label=label_)
-            avg_cost = fluid.layers.mean(x=cost)
-            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
-            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
-            pd.write_output(avg_cost)
-            pd.write_output(acc_top1)
-            pd.write_output(acc_top5)
-
-        avg_cost, acc_top1, acc_top5 = pd()
-        avg_cost = fluid.layers.mean(x=avg_cost)
-        acc_top1 = fluid.layers.mean(x=acc_top1)
-        acc_top5 = fluid.layers.mean(x=acc_top5)
-    else:
-        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
-        cost = fluid.layers.cross_entropy(input=out, label=label)
-        avg_cost = fluid.layers.mean(x=cost)
-        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
-        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-
-    if lr_strategy is None:
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=learning_rate,
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
-    else:
-        bd = lr_strategy["bd"]
-        lr = lr_strategy["lr"]
-        optimizer = fluid.optimizer.Momentum(
-            learning_rate=fluid.layers.piecewise_decay(
-                boundaries=bd, values=lr),
-            momentum=0.9,
-            regularization=fluid.regularizer.L2Decay(1e-4))
-
-    opts = optimizer.minimize(avg_cost)
-    fluid.memory_optimize(fluid.default_main_program())
-
-    inference_program = fluid.default_main_program().clone()
-    with fluid.program_guard(inference_program):
-        inference_program = fluid.io.get_inference_program(
-            [avg_cost, acc_top1, acc_top5])
-
-    place = fluid.CUDAPlace(0)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    if init_model is not None:
-        fluid.io.load_persistables(exe, init_model)
-
-    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
-    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
-    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
-
-    for pass_id in range(num_passes):
-        train_info = [[], [], []]
-        test_info = [[], [], []]
-        for batch_id, data in enumerate(train_reader()):
-            t1 = time.time()
-            loss, acc1, acc5 = exe.run(
-                fluid.default_main_program(),
-                feed=feeder.feed(data),
-                fetch_list=[avg_cost, acc_top1, acc_top5])
-            t2 = time.time()
-            period = t2 - t1
-            train_info[0].append(loss[0])
-            train_info[1].append(acc1[0])
-            train_info[2].append(acc5[0])
-            if batch_id % 10 == 0:
-                print("Pass {0}, trainbatch {1}, loss {2}, \
-                       acc1 {3}, acc5 {4} time {5}"
-                                                   .format(pass_id, \
-                       batch_id, loss[0], acc1[0], acc5[0], \
-                       "%2.2f sec" % period))
-                sys.stdout.flush()
-
-        train_loss = np.array(train_info[0]).mean()
-        train_acc1 = np.array(train_info[1]).mean()
-        train_acc5 = np.array(train_info[2]).mean()
-        for data in test_reader():
-            t1 = time.time()
-            loss, acc1, acc5 = exe.run(
-                inference_program,
-                feed=feeder.feed(data),
-                fetch_list=[avg_cost, acc_top1, acc_top5])
-            t2 = time.time()
-            period = t2 - t1
-            test_info[0].append(loss[0])
-            test_info[1].append(acc1[0])
-            test_info[2].append(acc5[0])
-            if batch_id % 10 == 0:
-                print("Pass {0},testbatch {1},loss {2}, \
-                       acc1 {3},acc5 {4},time {5}"
-                                                  .format(pass_id, \
-                       batch_id, loss[0], acc1[0], acc5[0], \
-                       "%2.2f sec" % period))
-                sys.stdout.flush()
-
-        test_loss = np.array(test_info[0]).mean()
-        test_acc1 = np.array(test_info[1]).mean()
-        test_acc5 = np.array(test_info[2]).mean()
-
-        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
-               test_loss {4}, test_acc1 {5}, test_acc5 {6}"
-                                                           .format(pass_id, \
-              train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
-              test_acc5))
-        sys.stdout.flush()
-
-        model_path = os.path.join(model_save_dir, str(pass_id))
-        if not os.path.isdir(model_path):
-            os.makedirs(model_path)
-        fluid.io.save_persistables(exe, model_path)
-
-
-if __name__ == '__main__':
-    epoch_points = [30, 60, 90]
-    total_images = 1281167
-    batch_size = 256
-    step = int(total_images / batch_size + 1)
-    bd = [e * step for e in epoch_points]
-    lr = [0.1, 0.01, 0.001, 0.0001]
-
-    lr_strategy = {"bd": bd, "lr": lr}
-
-    use_nccl = True
-    # layers: 50, 152
-    layers = 50
-
-    train(
-        learning_rate=0.1,
-        batch_size=batch_size,
-        num_passes=120,
-        init_model=None,
-        parallel=True,
-        use_nccl=True,
-        lr_strategy=lr_strategy,
-        layers=layers)
--- a/fluid/image_classification/train.py
+++ b/fluid/image_classification/train.py
+import os
+import numpy as np
+import time
+import sys
+import paddle.v2 as paddle
+import paddle.fluid as fluid
+from se_resnext import SE_ResNeXt
+import reader
+
+import argparse
+import functools
+from utility import add_arguments, print_arguments
+
+# command line options of this script, registered through the
+# 'add_arguments' helper imported from the local 'utility' module
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+# NOTE(review): how 'bool' options are parsed depends on 'add_arguments',
+# which is not visible here -- confirm that passing "False" works as expected
+add_arg('batch_size',   int,  256, "Minibatch size.")
+add_arg('num_layers',   int,  50,  "How many layers for SE-ResNeXt model.")
+add_arg('with_mem_opt', bool, True, "Whether to use memory optimization or not.")
+add_arg('parallel_exe', bool, True, "Whether to use ParallelExecutor to train or not.")
+
+def train_paralle_do(args,
+                     learning_rate,
+                     batch_size,
+                     num_passes,
+                     init_model=None,
+                     model_save_dir='model',
+                     parallel=True,
+                     use_nccl=True,
+                     lr_strategy=None,
+                     layers=50):
+    """Train SE-ResNeXt with a plain Executor, optionally wrapped in ParallelDo.
+
+    After every pass the test reader is evaluated with a 'for_test' clone of
+    the main program and the persistables are saved to
+    '<model_save_dir>/<pass_id>'.
+
+    Args:
+        args: parsed command line options; only 'with_mem_opt' is read here.
+        learning_rate: learning rate used when 'lr_strategy' is None.
+        batch_size: batch size of both the train and the test reader.
+        num_passes: number of passes over the training reader.
+        init_model: directory to load persistables from before training,
+            or None to start from the startup program only.
+        model_save_dir: root directory of the per-pass checkpoints.
+        parallel: build the network inside a ParallelDo block when True.
+        use_nccl: forwarded to ParallelDo.
+        lr_strategy: None, or a dict whose "bd" and "lr" entries are passed
+            to piecewise_decay as boundaries and values.
+        layers: forwarded to SE_ResNeXt.
+    """
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
+
+        with pd.do():
+            image_ = pd.read_input(image)
+            label_ = pd.read_input(label)
+            out = SE_ResNeXt(input=image_, class_dim=class_dim, layers=layers)
+            cost = fluid.layers.cross_entropy(input=out, label=label_)
+            avg_cost = fluid.layers.mean(x=cost)
+            acc_top1 = fluid.layers.accuracy(input=out, label=label_, k=1)
+            acc_top5 = fluid.layers.accuracy(input=out, label=label_, k=5)
+            pd.write_output(avg_cost)
+            pd.write_output(acc_top1)
+            pd.write_output(acc_top5)
+
+        # the ParallelDo outputs are averaged again into single values
+        avg_cost, acc_top1, acc_top5 = pd()
+        avg_cost = fluid.layers.mean(x=avg_cost)
+        acc_top1 = fluid.layers.mean(x=acc_top1)
+        acc_top5 = fluid.layers.mean(x=acc_top5)
+    else:
+        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+        cost = fluid.layers.cross_entropy(input=out, label=label)
+        avg_cost = fluid.layers.mean(x=cost)
+        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+
+    if lr_strategy is None:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        bd = lr_strategy["bd"]
+        lr = lr_strategy["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    # the test program is cloned before 'minimize' is called on the main one
+    inference_program = fluid.default_main_program().clone(for_test=True)
+
+    optimizer.minimize(avg_cost)
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+        fluid.memory_optimize(inference_program)
+
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    for pass_id in range(num_passes):
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            train_info[0].append(loss[0])
+            train_info[1].append(acc1[0])
+            train_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                       acc1 {3}, acc5 {4} time {5}"
+                                                   .format(pass_id, \
+                       batch_id, loss[0], acc1[0], acc5[0], \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        # enumerate the test batches as well: without it the logging below
+        # would reuse the last 'batch_id' left over from the training loop
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = exe.run(
+                inference_program,
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost, acc_top1, acc_top5])
+            t2 = time.time()
+            period = t2 - t1
+            test_info[0].append(loss[0])
+            test_info[1].append(acc1[0])
+            test_info[2].append(acc5[0])
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                       acc1 {3},acc5 {4},time {5}"
+                                                  .format(pass_id, \
+                       batch_id, loss[0], acc1[0], acc5[0], \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+               test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+                                                           .format(pass_id, \
+              train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+              test_acc5))
+        sys.stdout.flush()
+
+        # one checkpoint directory per pass
+        model_path = os.path.join(model_save_dir, str(pass_id))
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
+
+def train_parallel_exe(args,
+                       learning_rate,
+                       batch_size,
+                       num_passes,
+                       init_model=None,
+                       model_save_dir='model',
+                       parallel=True,
+                       use_nccl=True,
+                       lr_strategy=None,
+                       layers=50):
+    """Train SE_ResNeXt with fluid.ParallelExecutor, testing after each pass.
+
+    The network is built on a [3, 224, 224] float32 image input and an int64
+    label input with 1000 classes. After every pass the averaged train/test
+    loss and top-1/top-5 accuracy are printed and the persistable variables
+    are saved under ``model_save_dir/<pass_id>``.
+
+    :param args: Parsed command line arguments; only ``args.with_mem_opt`` is
+        read here (enables fluid.memory_optimize on both programs).
+    :param learning_rate: Constant learning rate, used when ``lr_strategy``
+        is None.
+    :param batch_size: Batch size for both the train and the test reader.
+    :param num_passes: Number of passes over the training reader.
+    :param init_model: Directory to load persistable variables from before
+        training, or None to keep the startup initialization.
+    :param model_save_dir: Root directory for the per-pass checkpoints.
+    :param parallel: Not used by this function; kept so that it can be called
+        with the same arguments as train_parallel_do.
+    :param use_nccl: Not used by this function; see ``parallel``.
+    :param lr_strategy: None, or a dict whose "bd" and "lr" entries are passed
+        to fluid.layers.piecewise_decay as boundaries and values.
+    :param layers: Forwarded to SE_ResNeXt as ``layers``.
+    """
+    class_dim = 1000
+    image_shape = [3, 224, 224]
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
+    cost = fluid.layers.cross_entropy(input=out, label=label)
+    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
+    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # The test program is cloned here, before optimizer.minimize() is called
+    # on the main program below.
+    test_program = fluid.default_main_program().clone(for_test=True)
+
+    if lr_strategy is None:
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=learning_rate,
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+    else:
+        bd = lr_strategy["bd"]
+        lr = lr_strategy["lr"]
+        optimizer = fluid.optimizer.Momentum(
+            learning_rate=fluid.layers.piecewise_decay(
+                boundaries=bd, values=lr),
+            momentum=0.9,
+            regularization=fluid.regularizer.L2Decay(1e-4))
+
+    opts = optimizer.minimize(avg_cost)
+
+    if args.with_mem_opt:
+        fluid.memory_optimize(fluid.default_main_program())
+        fluid.memory_optimize(test_program)
+
+    # A plain Executor runs the startup program, loads the initial model and
+    # later saves the checkpoints; the ParallelExecutors run the batches.
+    place = fluid.CUDAPlace(0)
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if init_model is not None:
+        fluid.io.load_persistables(exe, init_model)
+
+    train_reader = paddle.batch(reader.train(), batch_size=batch_size)
+    test_reader = paddle.batch(reader.test(), batch_size=batch_size)
+    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
+
+    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
+    test_exe = fluid.ParallelExecutor(
+        use_cuda=True,
+        main_program=test_program,
+        share_vars_from=train_exe)
+
+    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
+
+    for pass_id in range(num_passes):
+        # Per-pass accumulators: [losses, top-1 accuracies, top-5 accuracies].
+        train_info = [[], [], []]
+        test_info = [[], [], []]
+        for batch_id, data in enumerate(train_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = train_exe.run(
+                fetch_list,
+                feed_dict=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            # Reduce each fetched result to a single scalar for this batch.
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            train_info[0].append(loss)
+            train_info[1].append(acc1)
+            train_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0}, trainbatch {1}, loss {2}, \
+                       acc1 {3}, acc5 {4} time {5}"
+                                                   .format(pass_id, \
+                       batch_id, loss, acc1, acc5, \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        train_loss = np.array(train_info[0]).mean()
+        train_acc1 = np.array(train_info[1]).mean()
+        train_acc5 = np.array(train_info[2]).mean()
+        # Enumerate the test batches so that batch_id is the index of the test
+        # batch. Without it, batch_id kept the index of the last training
+        # batch, which made both the logging condition and the printed
+        # "testbatch" number wrong.
+        for batch_id, data in enumerate(test_reader()):
+            t1 = time.time()
+            loss, acc1, acc5 = test_exe.run(
+                fetch_list,
+                feed_dict=feeder.feed(data))
+            t2 = time.time()
+            period = t2 - t1
+            loss = np.mean(np.array(loss))
+            acc1 = np.mean(np.array(acc1))
+            acc5 = np.mean(np.array(acc5))
+            test_info[0].append(loss)
+            test_info[1].append(acc1)
+            test_info[2].append(acc5)
+            if batch_id % 10 == 0:
+                print("Pass {0},testbatch {1},loss {2}, \
+                       acc1 {3},acc5 {4},time {5}"
+                                                  .format(pass_id, \
+                       batch_id, loss, acc1, acc5, \
+                       "%2.2f sec" % period))
+                sys.stdout.flush()
+
+        test_loss = np.array(test_info[0]).mean()
+        test_acc1 = np.array(test_info[1]).mean()
+        test_acc5 = np.array(test_info[2]).mean()
+
+        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, \
+               test_loss {4}, test_acc1 {5}, test_acc5 {6}"
+                                                           .format(pass_id, \
+              train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
+              test_acc5))
+        sys.stdout.flush()
+
+        # Checkpoint the persistable variables of this pass.
+        model_path = os.path.join(model_save_dir, str(pass_id))
+        if not os.path.isdir(model_path):
+            os.makedirs(model_path)
+        fluid.io.save_persistables(exe, model_path)
+
+
+
+
+if __name__ == '__main__':
+    # Script entry point: parse the command line, build the piecewise
+    # learning-rate schedule and start training with the selected method.
+    args = parser.parse_args()
+    print_arguments(args)
+
+    # Epochs after which the learning rate steps down to the next value.
+    epoch_points = [30, 60, 90]
+    # presumably the size of the ImageNet-1k training set -- TODO confirm
+    total_images = 1281167
+    batch_size = args.batch_size
+    # Number of batches per epoch, used to turn epochs into iteration counts.
+    step = int(total_images / batch_size + 1)
+    bd = [e * step for e in epoch_points]
+    lr = [0.1, 0.01, 0.001, 0.0001]
+
+    lr_strategy = {"bd": bd, "lr": lr}
+
+    use_nccl = True
+    # layers: 50, 152
+    layers = args.num_layers
+    method = train_parallel_exe if args.parallel_exe else train_parallel_do
+    # use_nccl was previously assigned but never used; pass it through
+    # instead of repeating the literal in the call.
+    method(args,
+           learning_rate=0.1,
+           batch_size=batch_size,
+           num_passes=120,
+           init_model=None,
+           parallel=True,
+           use_nccl=use_nccl,
+           lr_strategy=lr_strategy,
+           layers=layers)
--- a/fluid/image_classification/utility.py
+++ b/fluid/image_classification/utility.py
+"""Contains common utility functions."""
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import distutils.util
+import numpy as np
+from paddle.fluid import core
+
+
+def print_arguments(args):
+    """Print argparse's arguments, one ``name: value`` line per argument.
+
+    The arguments are printed in sorted order between two separator lines.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("-----------  Configuration Arguments -----------")
+    # dict.items() exists on both Python 2 and Python 3, whereas iteritems()
+    # is Python 2 only and raises AttributeError on Python 3.
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Register the optional argument ``--<argname>`` on ``argparser``.
+
+    When ``type`` is ``bool`` the value is parsed with
+    ``distutils.util.strtobool``. The text `` Default: %(default)s.`` is
+    appended to ``help``, and any extra keyword arguments are forwarded to
+    ``argparser.add_argument``.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    if type == bool:
+        type = distutils.util.strtobool
+    option_name = "--" + argname
+    help_text = help + ' Default: %(default)s.'
+    argparser.add_argument(
+        option_name, default=default, type=type, help=help_text, **kwargs)
--- a/fluid/neural_machine_translation/transformer/config.py
+++ b/fluid/neural_machine_translation/transformer/config.py
@@ -15,6 +15,9 @@ class TrainTaskConfig(object):
    # the parameters for learning rate scheduling.
    warmup_steps = 4000

+    # the flag indicating to use average loss or sum loss when training.
+    use_avg_cost = False
+
    # the directory for saving trained models.
    model_dir = "trained_models"

@@ -22,8 +25,7 @@ class TrainTaskConfig(object):
 class InferTaskConfig(object):
    use_gpu = False
    # the number of examples in one run for sequence generation.
-    # currently the batch size can only be set to 1.
-    batch_size = 1
+    batch_size = 10

    # the parameters for beam search.
    beam_size = 5
@@ -31,37 +33,38 @@ class InferTaskConfig(object):
    # the number of decoded sentences to output.
    n_best = 1

+    # the flags indicating whether to output the special tokens.
+    output_bos = False
+    output_eos = False
+    output_unk = False
+
    # the directory for loading the trained model.
    model_path = "trained_models/pass_1.infer.model"


 class ModelHyperParams(object):
-    # Dictionary size for source and target language. This model directly uses
-    # paddle.dataset.wmt16 in which <bos>, <eos> and <unk> token has
-    # alreay been added, but the <pad> token is not added. Transformer requires
-    # sequences in a mini-batch are padded to have the same length. A <pad> token is
-    # added into the original dictionary in paddle.dateset.wmt16.
+    # This model directly uses paddle.dataset.wmt16, in which the <bos>, <eos>
+    # and <unk> tokens have already been added. As for the <pad> token, any
+    # token included in the dict can be used to pad, since the paddings' loss
+    # will be masked out and has no effect on parameter gradients.

    # size of source word dictionary.
    src_vocab_size = 10000
-    # index for <pad> token in source language.
-    src_pad_idx = src_vocab_size

    # size of target word dictionay
    trg_vocab_size = 10000
-    # index for <pad> token in target language.
-    trg_pad_idx = trg_vocab_size

    # index for <bos> token
    bos_idx = 0
    # index for <eos> token
    eos_idx = 1
+    # index for <unk> token
+    unk_idx = 2

-    # position value corresponding to the <pad> token.
-    pos_pad_idx = 0
-
-    # max length of sequences. It should plus 1 to include position
-    # padding token for position encoding.
+    # max length of sequences.
+    # The position encoding table should hold at least max_length + 1 entries,
+    # since the sinusoid position encoding starts from 1 and 0 can be used as
+    # the padding token for position encoding.
    max_length = 50

    # the dimension for word embeddings, which is also the last dimension of
@@ -93,6 +96,7 @@ encoder_input_data_names = (
    "src_word",
    "src_pos",
    "src_slf_attn_bias",
+    "src_data_shape",
    "src_slf_attn_pre_softmax_shape",
    "src_slf_attn_post_softmax_shape", )

@@ -102,6 +106,7 @@ decoder_input_data_names = (
    "trg_pos",
    "trg_slf_attn_bias",
    "trg_src_attn_bias",
+    "trg_data_shape",
    "trg_slf_attn_pre_softmax_shape",
    "trg_slf_attn_post_softmax_shape",
    "trg_src_attn_pre_softmax_shape",

--- a/fluid/neural_machine_translation/transformer/infer.py
+++ b/fluid/neural_machine_translation/transformer/infer.py
@@ -11,10 +11,26 @@ from config import InferTaskConfig, ModelHyperParams, \
 from train import pad_batch_data


-def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
-                    decoder, dec_in_names, dec_out_names, beam_size, max_length,
-                    n_best, batch_size, n_head, src_pad_idx, trg_pad_idx,
-                    bos_idx, eos_idx):
+def translate_batch(exe,
+                    src_words,
+                    encoder,
+                    enc_in_names,
+                    enc_out_names,
+                    decoder,
+                    dec_in_names,
+                    dec_out_names,
+                    beam_size,
+                    max_length,
+                    n_best,
+                    batch_size,
+                    n_head,
+                    d_model,
+                    src_pad_idx,
+                    trg_pad_idx,
+                    bos_idx,
+                    eos_idx,
+                    unk_idx,
+                    output_unk=True):
    """
    Run the encoder program once and run the decoder program multiple times to
    implement beam search externally.
@@ -25,9 +41,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
        src_pad_idx,
        n_head,
        is_target=False,
-        return_pos=True,
+        is_label=False,
        return_attn_bias=True,
        return_max_len=False)
+    # Append the data shape input to reshape the output of embedding layer.
+    enc_in_data = enc_in_data + [
+        np.array(
+            [-1, enc_in_data[2].shape[-1], d_model], dtype="int32")
+    ]
    # Append the shape inputs to reshape before and after softmax in encoder
    # self attention.
    enc_in_data = enc_in_data + [
@@ -44,11 +65,16 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
    scores = np.zeros((batch_size, beam_size), dtype="float32")
    prev_branchs = [[] for i in range(batch_size)]
    next_ids = [[] for i in range(batch_size)]
-    # Use beam_map to map the instance idx in batch to beam idx, since the
+    # Use beam_inst_map to map beam idx to the instance idx in batch, since the
    # size of feeded batch is changing.
-    beam_map = range(batch_size)
+    beam_inst_map = {
+        beam_idx: inst_idx
+        for inst_idx, beam_idx in enumerate(range(batch_size))
+    }
+    # Use active_beams to record the beams that are still alive.
+    active_beams = range(batch_size)

-    def beam_backtrace(prev_branchs, next_ids, n_best=beam_size, add_bos=True):
+    def beam_backtrace(prev_branchs, next_ids, n_best=beam_size):
        """
        Decode and select n_best sequences for one instance by backtrace.
        """
@@ -60,7 +86,8 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
                seq.append(next_ids[j][k])
                k = prev_branchs[j][k]
            seq = seq[::-1]
-            seq = [bos_idx] + seq if add_bos else seq
+            # Add the <bos>, since next_ids don't include the <bos>.
+            seq = [bos_idx] + seq
            seqs.append(seq)
        return seqs

@@ -82,8 +109,14 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
                             [-1e9]).astype("float32")
        # This is used to remove attention on the paddings of source sequences.
        trg_src_attn_bias = np.tile(
-            src_slf_attn_bias[:, :, ::src_max_length, :],
-            [beam_size, 1, trg_max_len, 1])
+            src_slf_attn_bias[:, :, ::src_max_length, :][:, np.newaxis],
+            [1, beam_size, 1, trg_max_len, 1]).reshape([
+                -1, src_slf_attn_bias.shape[1], trg_max_len,
+                src_slf_attn_bias.shape[-1]
+            ])
+        # Append the shape input to reshape the output of embedding layer.
+        trg_data_shape = np.array(
+            [batch_size * beam_size, trg_max_len, d_model], dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # decoder self attention.
        trg_slf_attn_pre_softmax_shape = np.array(
@@ -96,26 +129,27 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
            [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
        trg_src_attn_post_softmax_shape = np.array(
            trg_src_attn_bias.shape, dtype="int32")
-        enc_output = np.tile(enc_output, [beam_size, 1, 1])
+        enc_output = np.tile(
+            enc_output[:, np.newaxis], [1, beam_size, 1, 1]).reshape(
+                [-1, enc_output.shape[-2], enc_output.shape[-1]])
        return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
-            trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \
-            trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \
-            enc_output
+            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
+            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
+            trg_src_attn_post_softmax_shape, enc_output

-    def update_dec_in_data(dec_in_data, next_ids, active_beams):
+    def update_dec_in_data(dec_in_data, next_ids, active_beams, beam_inst_map):
        """
        Update the input data of decoder mainly by slicing from the previous
        input data and dropping the finished instance beams.
        """
        trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
-            trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \
-            trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \
-            enc_output = dec_in_data
-        trg_cur_len = len(next_ids[0]) + 1  # include the <bos>
+            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
+            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
+            trg_src_attn_post_softmax_shape, enc_output = dec_in_data
+        trg_cur_len = trg_slf_attn_bias.shape[-1] + 1
        trg_words = np.array(
            [
-                beam_backtrace(
-                    prev_branchs[beam_idx], next_ids[beam_idx], add_bos=True)
+                beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx])
                for beam_idx in active_beams
            ],
            dtype="int64")
@@ -123,6 +157,7 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
        trg_pos = np.array(
            [range(1, trg_cur_len + 1)] * len(active_beams) * beam_size,
            dtype="int64").reshape([-1, 1])
+        active_beams = [beam_inst_map[beam_idx] for beam_idx in active_beams]
        active_beams_indice = (
            (np.array(active_beams) * beam_size)[:, np.newaxis] +
            np.array(range(beam_size))[np.newaxis, :]).flatten()
@@ -137,6 +172,10 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
        trg_src_attn_bias = np.tile(trg_src_attn_bias[
            active_beams_indice, :, ::trg_src_attn_bias.shape[2], :],
                                    [1, 1, trg_cur_len, 1])
+        # Append the shape input to reshape the output of embedding layer.
+        trg_data_shape = np.array(
+            [len(active_beams) * beam_size, trg_cur_len, d_model],
+            dtype="int32")
        # Append the shape inputs to reshape before and after softmax in
        # decoder self attention.
        trg_slf_attn_pre_softmax_shape = np.array(
@@ -151,9 +190,9 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
            trg_src_attn_bias.shape, dtype="int32")
        enc_output = enc_output[active_beams_indice, :, :]
        return trg_words, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
-            trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape, \
-            trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape, \
-            enc_output
+            trg_data_shape, trg_slf_attn_pre_softmax_shape, \
+            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape, \
+            trg_src_attn_post_softmax_shape, enc_output

    dec_in_data = init_dec_in_data(batch_size, beam_size, enc_in_data,
                                   enc_output)
@@ -162,13 +201,18 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
                              feed=dict(zip(dec_in_names, dec_in_data)),
                              fetch_list=dec_out_names)[0]
        predict_all = np.log(
-            predict_all.reshape([len(beam_map) * beam_size, i + 1, -1])[:,
-                                                                        -1, :])
-        predict_all = (predict_all + scores[beam_map].reshape(
-            [len(beam_map) * beam_size, -1])).reshape(
-                [len(beam_map), beam_size, -1])
+            predict_all.reshape([len(beam_inst_map) * beam_size, i + 1, -1])
+            [:, -1, :])
+        predict_all = (predict_all + scores[active_beams].reshape(
+            [len(beam_inst_map) * beam_size, -1])).reshape(
+                [len(beam_inst_map), beam_size, -1])
+        if not output_unk:  # To exclude the <unk> token.
+            predict_all[:, :, unk_idx] = -1e9
        active_beams = []
-        for inst_idx, beam_idx in enumerate(beam_map):
+        for beam_idx in range(batch_size):
+            if not beam_inst_map.has_key(beam_idx):
+                continue
+            inst_idx = beam_inst_map[beam_idx]
            predict = (predict_all[inst_idx, :, :]
                       if i != 0 else predict_all[inst_idx, 0, :]).flatten()
            top_k_indice = np.argpartition(predict, -beam_size)[-beam_size:]
@@ -181,13 +225,20 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
            next_ids[beam_idx].append(top_scores_ids % predict_all.shape[-1])
            if next_ids[beam_idx][-1][0] != eos_idx:
                active_beams.append(beam_idx)
-        beam_map = active_beams
-        if len(beam_map) == 0:
+        if len(active_beams) == 0:
            break
-        dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams)
+        dec_in_data = update_dec_in_data(dec_in_data, next_ids, active_beams,
+                                         beam_inst_map)
+        beam_inst_map = {
+            beam_idx: inst_idx
+            for inst_idx, beam_idx in enumerate(active_beams)
+        }

    # Decode beams and select n_best sequences for each instance by backtrace.
-    seqs = [beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)]
+    seqs = [
+        beam_backtrace(prev_branchs[beam_idx], next_ids[beam_idx], n_best)
+        for beam_idx in range(batch_size)
+    ]

    return seqs, scores[:, :n_best].tolist()

@@ -195,29 +246,24 @@ def translate_batch(exe, src_words, encoder, enc_in_names, enc_out_names,
 def main():
    place = fluid.CUDAPlace(0) if InferTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
-    # The current program desc is coupled with batch_size and the only
-    # supported batch size is 1 currently.
+
    encoder_program = fluid.Program()
-    model.batch_size = InferTaskConfig.batch_size
    with fluid.program_guard(main_program=encoder_program):
        enc_output = encoder(
-            ModelHyperParams.src_vocab_size + 1,
-            ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
-            ModelHyperParams.n_head, ModelHyperParams.d_key,
-            ModelHyperParams.d_value, ModelHyperParams.d_model,
-            ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
-            ModelHyperParams.src_pad_idx, ModelHyperParams.pos_pad_idx)
+            ModelHyperParams.src_vocab_size, ModelHyperParams.max_length + 1,
+            ModelHyperParams.n_layer, ModelHyperParams.n_head,
+            ModelHyperParams.d_key, ModelHyperParams.d_value,
+            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+            ModelHyperParams.dropout)

-    model.batch_size = InferTaskConfig.batch_size * InferTaskConfig.beam_size
    decoder_program = fluid.Program()
    with fluid.program_guard(main_program=decoder_program):
        predict = decoder(
-            ModelHyperParams.trg_vocab_size + 1,
-            ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
-            ModelHyperParams.n_head, ModelHyperParams.d_key,
-            ModelHyperParams.d_value, ModelHyperParams.d_model,
-            ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
-            ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+            ModelHyperParams.trg_vocab_size, ModelHyperParams.max_length + 1,
+            ModelHyperParams.n_layer, ModelHyperParams.n_head,
+            ModelHyperParams.d_key, ModelHyperParams.d_value,
+            ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
+            ModelHyperParams.dropout)

    # Load model parameters of encoder and decoder separately from the saved
    # transformer model.
@@ -254,17 +300,51 @@ def main():
    trg_idx2word = paddle.dataset.wmt16.get_dict(
        "de", dict_size=ModelHyperParams.trg_vocab_size, reverse=True)

+    def post_process_seq(seq,
+                         bos_idx=ModelHyperParams.bos_idx,
+                         eos_idx=ModelHyperParams.eos_idx,
+                         output_bos=InferTaskConfig.output_bos,
+                         output_eos=InferTaskConfig.output_eos):
+        """
+        Cut a decoded sequence just after its first <eos>, then drop every
+        <bos> / <eos> token unless output_bos / output_eos asks to keep it.
+        """
+        cut = len(seq)
+        for pos, token in enumerate(seq):
+            if token == eos_idx:
+                cut = pos + 1
+                break
+
+        def keep(token):
+            return not ((token == bos_idx and not output_bos) or
+                        (token == eos_idx and not output_eos))
+        return filter(keep, seq[:cut])
+
    for batch_id, data in enumerate(test_data()):
        batch_seqs, batch_scores = translate_batch(
-            exe, [item[0] for item in data], encoder_program,
-            encoder_input_data_names, [enc_output.name], decoder_program,
-            decoder_input_data_names, [predict.name], InferTaskConfig.beam_size,
-            InferTaskConfig.max_length, InferTaskConfig.n_best,
-            len(data), ModelHyperParams.n_head, ModelHyperParams.src_pad_idx,
-            ModelHyperParams.trg_pad_idx, ModelHyperParams.bos_idx,
-            ModelHyperParams.eos_idx)
+            exe,
+            [item[0] for item in data],
+            encoder_program,
+            encoder_input_data_names,
+            [enc_output.name],
+            decoder_program,
+            decoder_input_data_names,
+            [predict.name],
+            InferTaskConfig.beam_size,
+            InferTaskConfig.max_length,
+            InferTaskConfig.n_best,
+            len(data),
+            ModelHyperParams.n_head,
+            ModelHyperParams.d_model,
+            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
+            ModelHyperParams.eos_idx,  # Use eos_idx to pad.
+            ModelHyperParams.bos_idx,
+            ModelHyperParams.eos_idx,
+            ModelHyperParams.unk_idx,
+            output_unk=InferTaskConfig.output_unk)
        for i in range(len(batch_seqs)):
-            seqs = batch_seqs[i]
+            # Post-process the beam-search decoded sequences.
+            seqs = map(post_process_seq, batch_seqs[i])
            scores = batch_scores[i]
            for seq in seqs:
                print(" ".join([trg_idx2word[idx] for idx in seq]))

--- a/fluid/neural_machine_translation/transformer/model.py
+++ b/fluid/neural_machine_translation/transformer/model.py
@@ -7,9 +7,6 @@ import paddle.fluid.layers as layers
 from config import TrainTaskConfig, pos_enc_param_names, \
    encoder_input_data_names, decoder_input_data_names, label_data_names

-# FIXME(guosheng): Remove out the batch_size from the model.
-batch_size = TrainTaskConfig.batch_size
-

 def position_encoding_init(n_position, d_pos_vec):
    """
@@ -85,9 +82,10 @@ def multi_head_attention(queries,
            return x

        hidden_size = x.shape[-1]
-        # FIXME(guosheng): Decouple the program desc with batch_size.
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
        reshaped = layers.reshape(
-            x=x, shape=[batch_size, -1, n_head, hidden_size // n_head])
+            x=x, shape=[0, -1, n_head, hidden_size // n_head])

        # permuate the dimensions into:
        # [batch_size, n_head, max_sequence_len, hidden_size_per_head]
@@ -103,11 +101,11 @@ def multi_head_attention(queries,
            raise ValueError("Input(x) should be a 4-D Tensor.")

        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
-        # FIXME(guosheng): Decouple the program desc with batch_size.
+        # The value 0 in shape attr means copying the corresponding dimension
+        # size of the input as the output dimension size.
        return layers.reshape(
            x=trans_x,
-            shape=map(int,
-                      [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]]))
+            shape=map(int, [0, -1, trans_x.shape[2] * trans_x.shape[3]]))

    def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate):
        """
@@ -201,10 +199,9 @@ def prepare_encoder(src_word,
                    src_pos,
                    src_vocab_size,
                    src_emb_dim,
-                    src_pad_idx,
                    src_max_len,
                    dropout_rate=0.,
-                    pos_pad_idx=0,
+                    src_data_shape=None,
                    pos_enc_param_name=None):
    """Add word embeddings and position encodings.
    The output tensor has a shape of:
@@ -215,18 +212,17 @@ def prepare_encoder(src_word,
    src_word_emb = layers.embedding(
        src_word,
        size=[src_vocab_size, src_emb_dim],
-        padding_idx=src_pad_idx,
        param_attr=fluid.initializer.Normal(0., 1.))
    src_pos_enc = layers.embedding(
        src_pos,
        size=[src_max_len, src_emb_dim],
-        padding_idx=pos_pad_idx,
        param_attr=fluid.ParamAttr(
            name=pos_enc_param_name, trainable=False))
    enc_input = src_word_emb + src_pos_enc
-
-    # FIXME(guosheng): Decouple the program desc with batch_size.
-    enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim])
+    enc_input = layers.reshape(
+        x=enc_input,
+        shape=[-1, src_max_len, src_emb_dim],
+        actual_shape=src_data_shape)
    return layers.dropout(
        enc_input, dropout_prob=dropout_rate,
        is_test=False) if dropout_rate else enc_input
@@ -401,20 +397,23 @@ def decoder(dec_input,
 def make_inputs(input_data_names,
                n_head,
                d_model,
-                batch_size,
                max_length,
                is_pos,
                slf_attn_bias_flag,
                src_attn_bias_flag,
                enc_output_flag=False,
+                data_shape_flag=True,
                slf_attn_shape_flag=True,
                src_attn_shape_flag=True):
    """
    Define the input data layers for the transformer model.
    """
    input_layers = []
-    # The shapes here act as placeholder.
-    # The shapes set here is to pass the infer-shape in compile time.
+    batch_size = 1  # Only for the infer-shape in compile time.
+    # The shapes here act as placeholder and are set to pass the infer-shape in
+    # compile time.
+    # The actual data shape of word is:
+    # [batch_size * max_len_in_batch, 1]
    word = layers.data(
        name=input_data_names[len(input_layers)],
        shape=[batch_size * max_length, 1],
@@ -422,6 +421,8 @@ def make_inputs(input_data_names,
        append_batch_size=False)
    input_layers += [word]
    # This is used for position data or label weight.
+    # The actual data shape of pos is:
+    # [batch_size * max_len_in_batch, 1]
    pos = layers.data(
        name=input_data_names[len(input_layers)],
        shape=[batch_size * max_length, 1],
@@ -432,6 +433,8 @@ def make_inputs(input_data_names,
        # This input is used to remove attention weights on paddings for the
        # encoder and to remove attention weights on subsequent words for the
        # decoder.
+        # The actual data shape of slf_attn_bias is:
+        # [batch_size, n_head, max_len_in_batch, max_len_in_batch]
        slf_attn_bias = layers.data(
            name=input_data_names[len(input_layers)],
            shape=[batch_size, n_head, max_length, max_length],
@@ -439,40 +442,60 @@ def make_inputs(input_data_names,
            append_batch_size=False)
        input_layers += [slf_attn_bias]
    if src_attn_bias_flag:
-        # This input is used to remove attention weights on paddings.
+        # This input is used to remove attention weights on paddings. It's used
+        # in encoder-decoder attention.
+        # The actual data shape of src_attn_bias is:
+        # [batch_size, n_head, trg_max_len_in_batch, src_max_len_in_batch]
        src_attn_bias = layers.data(
            name=input_data_names[len(input_layers)],
            shape=[batch_size, n_head, max_length, max_length],
            dtype="float32",
            append_batch_size=False)
        input_layers += [src_attn_bias]
+    if data_shape_flag:
+        # This input is used to reshape the output of embedding layer.
+        data_shape = layers.data(
+            name=input_data_names[len(input_layers)],
+            shape=[3],
+            dtype="int32",
+            append_batch_size=False)
+        input_layers += [data_shape]
    if slf_attn_shape_flag:
+        # This shape input is used to reshape before softmax in self attention.
        slf_attn_pre_softmax_shape = layers.data(
            name=input_data_names[len(input_layers)],
-            shape=[3],
+            shape=[2],
            dtype="int32",
            append_batch_size=False)
        input_layers += [slf_attn_pre_softmax_shape]
+        # This shape input is used to reshape after softmax in self attention.
        slf_attn_post_softmax_shape = layers.data(
            name=input_data_names[len(input_layers)],
-            shape=[3],
+            shape=[4],
            dtype="int32",
            append_batch_size=False)
        input_layers += [slf_attn_post_softmax_shape]
    if src_attn_shape_flag:
+        # This shape input is used to reshape before softmax in encoder-decoder
+        # attention.
        src_attn_pre_softmax_shape = layers.data(
            name=input_data_names[len(input_layers)],
-            shape=[3],
+            shape=[2],
            dtype="int32",
            append_batch_size=False)
        input_layers += [src_attn_pre_softmax_shape]
+        # This shape input is used to reshape after softmax in encoder-decoder
+        # attention.
        src_attn_post_softmax_shape = layers.data(
            name=input_data_names[len(input_layers)],
-            shape=[3],
+            shape=[4],
            dtype="int32",
            append_batch_size=False)
        input_layers += [src_attn_post_softmax_shape]
    if enc_output_flag:
+        # This input is used in independent decoder program for inference.
+        # The actual data shape of enc_output is:
+        # [batch_size, max_len_in_batch, d_model]
        enc_output = layers.data(
            name=input_data_names[len(input_layers)],
            shape=[batch_size, max_length, d_model],
@@ -493,20 +516,17 @@ def transformer(
        d_value,
        d_model,
        d_inner_hid,
-        dropout_rate,
-        src_pad_idx,
-        trg_pad_idx,
-        pos_pad_idx, ):
-    enc_input_layers = make_inputs(
+        dropout_rate, ):
+    enc_inputs = make_inputs(
        encoder_input_data_names,
        n_head,
        d_model,
-        batch_size,
        max_length,
        is_pos=True,
        slf_attn_bias_flag=True,
        src_attn_bias_flag=False,
        enc_output_flag=False,
+        data_shape_flag=True,
        slf_attn_shape_flag=True,
        src_attn_shape_flag=False)

@@ -520,20 +540,18 @@ def transformer(
        d_model,
        d_inner_hid,
        dropout_rate,
-        src_pad_idx,
-        pos_pad_idx,
-        enc_input_layers, )
+        enc_inputs, )

-    dec_input_layers = make_inputs(
+    dec_inputs = make_inputs(
        decoder_input_data_names,
        n_head,
        d_model,
-        batch_size,
        max_length,
        is_pos=True,
        slf_attn_bias_flag=True,
        src_attn_bias_flag=True,
        enc_output_flag=False,
+        data_shape_flag=True,
        slf_attn_shape_flag=True,
        src_attn_shape_flag=True)

@@ -547,9 +565,7 @@ def transformer(
        d_model,
        d_inner_hid,
        dropout_rate,
-        trg_pad_idx,
-        pos_pad_idx,
-        dec_input_layers,
+        dec_inputs,
        enc_output, )

    # Padding index do not contribute to the total loss. The weights is used to
@@ -558,17 +574,20 @@ def transformer(
        label_data_names,
        n_head,
        d_model,
-        batch_size,
        max_length,
        is_pos=False,
        slf_attn_bias_flag=False,
        src_attn_bias_flag=False,
        enc_output_flag=False,
+        data_shape_flag=False,
        slf_attn_shape_flag=False,
        src_attn_shape_flag=False)
    cost = layers.softmax_with_cross_entropy(logits=predict, label=gold)
    weighted_cost = cost * weights
-    return layers.reduce_sum(weighted_cost), predict
+    sum_cost = layers.reduce_sum(weighted_cost)
+    token_num = layers.reduce_sum(weights)
+    avg_cost = sum_cost / token_num
+    return sum_cost, avg_cost, predict, token_num


 def wrap_encoder(src_vocab_size,
@@ -580,38 +599,38 @@ def wrap_encoder(src_vocab_size,
                 d_model,
                 d_inner_hid,
                 dropout_rate,
-                 src_pad_idx,
-                 pos_pad_idx,
-                 enc_input_layers=None):
+                 enc_inputs=None):
    """
    The wrapper assembles together all needed layers for the encoder.
    """
-    if enc_input_layers is None:
+    if enc_inputs is None:
        # This is used to implement independent encoder program in inference.
-        src_word, src_pos, src_slf_attn_bias, slf_attn_pre_softmax_shape, \
-            slf_attn_post_softmax_shape = make_inputs(
+        src_word, src_pos, src_slf_attn_bias, src_data_shape, \
+            slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
+            make_inputs(
                encoder_input_data_names,
                n_head,
                d_model,
-                batch_size,
                max_length,
                is_pos=True,
                slf_attn_bias_flag=True,
                src_attn_bias_flag=False,
                enc_output_flag=False,
+                data_shape_flag=True,
                slf_attn_shape_flag=True,
                src_attn_shape_flag=False)
    else:
-        src_word, src_pos, src_slf_attn_bias, slf_attn_pre_softmax_shape, \
-            slf_attn_post_softmax_shape = enc_input_layers
+        src_word, src_pos, src_slf_attn_bias, src_data_shape, \
+            slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape = \
+            enc_inputs
    enc_input = prepare_encoder(
        src_word,
        src_pos,
        src_vocab_size,
        d_model,
-        src_pad_idx,
        max_length,
-        dropout_rate, )
+        dropout_rate,
+        src_data_shape, )
    enc_output = encoder(
        enc_input,
        src_slf_attn_bias,
@@ -636,44 +655,42 @@ def wrap_decoder(trg_vocab_size,
                 d_model,
                 d_inner_hid,
                 dropout_rate,
-                 trg_pad_idx,
-                 pos_pad_idx,
-                 dec_input_layers=None,
+                 dec_inputs=None,
                 enc_output=None):
    """
    The wrapper assembles together all needed layers for the decoder.
    """
-    if dec_input_layers is None:
+    if dec_inputs is None:
        # This is used to implement independent decoder program in inference.
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
-            slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \
-            src_attn_pre_softmax_shape, src_attn_post_softmax_shape, \
-            enc_output = make_inputs(
+            trg_data_shape, slf_attn_pre_softmax_shape, \
+            slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
+            src_attn_post_softmax_shape, enc_output = make_inputs(
                decoder_input_data_names,
                n_head,
                d_model,
-                batch_size,
                max_length,
                is_pos=True,
                slf_attn_bias_flag=True,
                src_attn_bias_flag=True,
                enc_output_flag=True,
+                data_shape_flag=True,
                slf_attn_shape_flag=True,
                src_attn_shape_flag=True)
    else:
        trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \
-            slf_attn_pre_softmax_shape, slf_attn_post_softmax_shape, \
-            src_attn_pre_softmax_shape, src_attn_post_softmax_shape = \
-                dec_input_layers
+            trg_data_shape, slf_attn_pre_softmax_shape, \
+            slf_attn_post_softmax_shape, src_attn_pre_softmax_shape, \
+            src_attn_post_softmax_shape = dec_inputs

    dec_input = prepare_decoder(
        trg_word,
        trg_pos,
        trg_vocab_size,
        d_model,
-        trg_pad_idx,
        max_length,
-        dropout_rate, )
+        dropout_rate,
+        trg_data_shape, )
    dec_output = decoder(
        dec_input,
        enc_output,
@@ -697,5 +714,5 @@ def wrap_decoder(trg_vocab_size,
                    bias_attr=False,
                    num_flatten_dims=2),
        shape=[-1, trg_vocab_size],
-        act="softmax" if dec_input_layers is None else None)
+        act="softmax" if dec_inputs is None else None)
    return predict
--- a/fluid/neural_machine_translation/transformer/train.py
+++ b/fluid/neural_machine_translation/transformer/train.py
 import os
+import time
 import numpy as np

 import paddle
@@ -14,7 +15,7 @@ def pad_batch_data(insts,
                   pad_idx,
                   n_head,
                   is_target=False,
-                   return_pos=True,
+                   is_label=False,
                   return_attn_bias=True,
                   return_max_len=True):
    """
@@ -23,14 +24,20 @@ def pad_batch_data(insts,
    """
    return_list = []
    max_len = max(len(inst) for inst in insts)
+    # Any token included in dict can be used to pad, since the paddings' loss
+    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array(
        [inst + [pad_idx] * (max_len - len(inst)) for inst in insts])
    return_list += [inst_data.astype("int64").reshape([-1, 1])]
-    if return_pos:
-        inst_pos = np.array([[
-            pos_i + 1 if w_i != pad_idx else 0 for pos_i, w_i in enumerate(inst)
-        ] for inst in inst_data])
-
+    if is_label:  # label weight
+        inst_weight = np.array(
+            [[1.] * len(inst) + [0.] * (max_len - len(inst)) for inst in insts])
+        return_list += [inst_weight.astype("float32").reshape([-1, 1])]
+    else:  # position data
+        inst_pos = np.array([
+            range(1, len(inst) + 1) + [0] * (max_len - len(inst))
+            for inst in insts
+        ])
        return_list += [inst_pos.astype("int64").reshape([-1, 1])]
    if return_attn_bias:
        if is_target:
@@ -56,7 +63,7 @@ def pad_batch_data(insts,


 def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
-                        max_length, n_head):
+                        n_head, d_model):
    """
    Put all padded data needed by training into a dict.
    """
@@ -66,6 +73,10 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
        [inst[1] for inst in insts], trg_pad_idx, n_head, is_target=True)
    trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :],
                                [1, 1, trg_max_len, 1]).astype("float32")
+
+    # These shape tensors are used in reshape_op.
+    src_data_shape = np.array([len(insts), src_max_len, d_model], dtype="int32")
+    trg_data_shape = np.array([len(insts), trg_max_len, d_model], dtype="int32")
    src_slf_attn_pre_softmax_shape = np.array(
        [-1, src_slf_attn_bias.shape[-1]], dtype="int32")
    src_slf_attn_post_softmax_shape = np.array(
@@ -78,17 +89,24 @@ def prepare_batch_input(insts, input_data_names, src_pad_idx, trg_pad_idx,
        [-1, trg_src_attn_bias.shape[-1]], dtype="int32")
    trg_src_attn_post_softmax_shape = np.array(
        trg_src_attn_bias.shape, dtype="int32")
-    lbl_word = pad_batch_data([inst[2] for inst in insts], trg_pad_idx, n_head,
-                              False, False, False, False)
-    lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1])
+
+    lbl_word, lbl_weight = pad_batch_data(
+        [inst[2] for inst in insts],
+        trg_pad_idx,
+        n_head,
+        is_target=False,
+        is_label=True,
+        return_attn_bias=False,
+        return_max_len=False)
+
    input_dict = dict(
        zip(input_data_names, [
-            src_word, src_pos, src_slf_attn_bias,
+            src_word, src_pos, src_slf_attn_bias, src_data_shape,
            src_slf_attn_pre_softmax_shape, src_slf_attn_post_softmax_shape,
            trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
-            trg_slf_attn_pre_softmax_shape, trg_slf_attn_post_softmax_shape,
-            trg_src_attn_pre_softmax_shape, trg_src_attn_post_softmax_shape,
-            lbl_word, lbl_weight
+            trg_data_shape, trg_slf_attn_pre_softmax_shape,
+            trg_slf_attn_post_softmax_shape, trg_src_attn_pre_softmax_shape,
+            trg_src_attn_post_softmax_shape, lbl_word, lbl_weight
        ]))
    return input_dict

@@ -97,14 +115,12 @@ def main():
    place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

-    cost, predict = transformer(
-        ModelHyperParams.src_vocab_size + 1,
-        ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1,
-        ModelHyperParams.n_layer, ModelHyperParams.n_head,
-        ModelHyperParams.d_key, ModelHyperParams.d_value,
-        ModelHyperParams.d_model, ModelHyperParams.d_inner_hid,
-        ModelHyperParams.dropout, ModelHyperParams.src_pad_idx,
-        ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx)
+    sum_cost, avg_cost, predict, token_num = transformer(
+        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
+        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
+        ModelHyperParams.n_head, ModelHyperParams.d_key,
+        ModelHyperParams.d_value, ModelHyperParams.d_model,
+        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps, place,
@@ -114,7 +130,7 @@ def main():
        beta1=TrainTaskConfig.beta1,
        beta2=TrainTaskConfig.beta2,
        epsilon=TrainTaskConfig.eps)
-    optimizer.minimize(cost)
+    optimizer.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
@@ -126,27 +142,31 @@ def main():
    # Program to do validation.
    test_program = fluid.default_main_program().clone()
    with fluid.program_guard(test_program):
-        test_program = fluid.io.get_inference_program([cost])
+        test_program = fluid.io.get_inference_program([avg_cost])
    val_data = paddle.batch(
        paddle.dataset.wmt16.validation(ModelHyperParams.src_vocab_size,
                                        ModelHyperParams.trg_vocab_size),
        batch_size=TrainTaskConfig.batch_size)

    def test(exe):
-        test_costs = []
+        test_total_cost = 0
+        test_total_token = 0
        for batch_id, data in enumerate(val_data()):
-            if len(data) != TrainTaskConfig.batch_size:
-                continue
            data_input = prepare_batch_input(
                data, encoder_input_data_names + decoder_input_data_names[:-1] +
-                label_data_names, ModelHyperParams.src_pad_idx,
-                ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length,
-                ModelHyperParams.n_head)
-            test_cost = exe.run(test_program,
-                                feed=data_input,
-                                fetch_list=[cost])[0]
-            test_costs.append(test_cost)
-        return np.mean(test_costs)
+                label_data_names, ModelHyperParams.eos_idx,
+                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
+                ModelHyperParams.d_model)
+            test_sum_cost, test_token_num = exe.run(
+                test_program,
+                feed=data_input,
+                fetch_list=[sum_cost, token_num],
+                use_program_cache=True)
+            test_total_cost += test_sum_cost
+            test_total_token += test_token_num
+        test_avg_cost = test_total_cost / test_total_token
+        test_ppl = np.exp([min(test_avg_cost, 100)])
+        return test_avg_cost, test_ppl

    # Initialize the parameters.
    exe.run(fluid.framework.default_startup_program())
@@ -158,27 +178,30 @@ def main():
                                   ModelHyperParams.d_model), place)

    for pass_id in xrange(TrainTaskConfig.pass_num):
+        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
-            # The current program desc is coupled with batch_size, thus all
-            # mini-batches must have the same number of instances currently.
            if len(data) != TrainTaskConfig.batch_size:
                continue
            data_input = prepare_batch_input(
                data, encoder_input_data_names + decoder_input_data_names[:-1] +
-                label_data_names, ModelHyperParams.src_pad_idx,
-                ModelHyperParams.trg_pad_idx, ModelHyperParams.max_length,
-                ModelHyperParams.n_head)
+                label_data_names, ModelHyperParams.eos_idx,
+                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
+                ModelHyperParams.d_model)
            lr_scheduler.update_learning_rate(data_input)
            outs = exe.run(fluid.framework.default_main_program(),
                           feed=data_input,
-                           fetch_list=[cost],
+                           fetch_list=[sum_cost, avg_cost],
                           use_program_cache=True)
-            cost_val = np.array(outs[0])
-            print("pass_id = " + str(pass_id) + " batch = " + str(batch_id) +
-                  " cost = " + str(cost_val))
+            sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
+            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
+                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
+                   np.exp([min(avg_cost_val[0], 100)])))
        # Validate and save the model for inference.
-        val_cost = test(exe)
-        print("pass_id = " + str(pass_id) + " val_cost = " + str(val_cost))
+        val_avg_cost, val_ppl = test(exe)
+        pass_end_time = time.time()
+        time_consumed = pass_end_time - pass_start_time
+        print("epoch: %d, val avg loss: %f, val ppl: %f, "
+              "consumed %fs" % (pass_id, val_avg_cost, val_ppl, time_consumed))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),

--- a/fluid/object_detection/.gitignore
+++ b/fluid/object_detection/.gitignore
+data/pascalvoc/VOCdevkit/
+data/pascalvoc/test.txt
+data/pascalvoc/trainval.txt
+pretrained/ssd_mobilenet_v1_coco.tar.gz
+pretrained/ssd_mobilenet_v1_coco
+pretrained/mobilenet_v1_imagenet.tar.gz
+pretrained/mobilenet_v1_imagenet
+log*
--- a/fluid/object_detection/data/prepare_voc_data.py
+++ b/fluid/object_detection/data/prepare_voc_data.py
@@ -60,4 +60,5 @@ def prepare_filelist(devkit_dir, years, output_dir):
            ftest.write(item[0] + ' ' + item[1] + '\n')


-prepare_filelist(devkit_dir, years, '.')
+if __name__ == '__main__':
+    prepare_filelist(devkit_dir, years, '.')
--- a/fluid/object_detection/data/pascalvoc/download.sh
+++ b/fluid/object_detection/data/pascalvoc/download.sh
+# Run from the directory that contains this script so the archives and the
+# extracted files land next to it, regardless of the caller's cwd.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the PASCAL VOC 2012 trainval and VOC 2007 trainval/test archives.
+echo "Downloading..."
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
+wget http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
+# Extract the data.
+echo "Extracting..."
+tar -xf VOCtrainval_11-May-2012.tar
+tar -xf VOCtrainval_06-Nov-2007.tar
+tar -xf VOCtest_06-Nov-2007.tar
+
+# Build the data lists with the create_list.py script in this directory.
+echo "Creating data lists..."
+python create_list.py
--- a/fluid/object_detection/data/label_list
+++ b/fluid/object_detection/data/label_list
--- a/fluid/object_detection/image_util.py
+++ b/fluid/object_detection/image_util.py
@@ -85,8 +85,7 @@ def satisfy_sample_constraint(sampler, sample_bbox, bbox_labels):
    return False


-def generate_batch_samples(batch_sampler, bbox_labels, image_width,
-                           image_height):
+def generate_batch_samples(batch_sampler, bbox_labels):
    sampled_bbox = []
    index = []
    c = 0
@@ -217,8 +216,8 @@ def distort_image(img, settings):
 def expand_image(img, bbox_labels, img_width, img_height, settings):
    prob = random.uniform(0, 1)
    if prob < settings._expand_prob:
-        expand_ratio = random.uniform(1, settings._expand_max_ratio)
-        if expand_ratio - 1 >= 0.01:
+        if settings._expand_max_ratio - 1 >= 0.01:  # no-op if max ratio ~ 1
+            expand_ratio = random.uniform(1, settings._expand_max_ratio)
            height = int(img_height * expand_ratio)
            width = int(img_width * expand_ratio)
            h_off = math.floor(random.uniform(0, height - img_height))
@@ -231,5 +230,5 @@ def expand_image(img, bbox_labels, img_width, img_height, settings):
            expand_img = Image.fromarray(expand_img)
            expand_img.paste(img, (int(w_off), int(h_off)))
            bbox_labels = transform_labels(bbox_labels, expand_bbox)
-            return expand_img, bbox_labels
-    return img, bbox_labels
+            return expand_img, bbox_labels, width, height
+    return img, bbox_labels, img_width, img_height
--- a/fluid/object_detection/load_model.py
+++ b/fluid/object_detection/load_model.py
-import paddle.v2 as paddle
-import paddle.fluid as fluid
-import numpy as np
-
-
-# From npy
-def load_vars():
-    vars = {}
-    name_map = {}
-    with open('./ssd_mobilenet_v1_coco/names.map', 'r') as map_file:
-        for param in map_file:
-            fd_name, tf_name = param.strip().split('\t')
-            name_map[fd_name] = tf_name
-
-    tf_vars = np.load(
-        './ssd_mobilenet_v1_coco/ssd_mobilenet_v1_coco_2017_11_17.npy').item()
-    for fd_name in name_map:
-        tf_name = name_map[fd_name]
-        tf_var = tf_vars[tf_name]
-        if len(tf_var.shape) == 4 and 'depthwise' in tf_name:
-            vars[fd_name] = np.transpose(tf_var, (2, 3, 0, 1))
-        elif len(tf_var.shape) == 4:
-            vars[fd_name] = np.transpose(tf_var, (3, 2, 0, 1))
-        else:
-            vars[fd_name] = tf_var
-
-    return vars
-
-
-def load_and_set_vars(place):
-    vars = load_vars()
-    for k, v in vars.items():
-        t = fluid.global_scope().find_var(k).get_tensor()
-        #print(np.array(t).shape, v.shape, k)
-        assert np.array(t).shape == v.shape
-        t.set(v, place)
-
-
-# From Paddle V1
-def load_paddlev1_vars(place):
-    vars = {}
-    name_map = {}
-    with open('./caffe2paddle/names.map', 'r') as map_file:
-        for param in map_file:
-            fd_name, tf_name = param.strip().split('\t')
-            name_map[fd_name] = tf_name
-
-    from operator import mul
-
-    def load(file_name, shape):
-        with open(file_name, 'rb') as f:
-            f.read(16)
-            arr = np.fromfile(f, dtype=np.float32)
-            #print(arr.size, reduce(mul, shape), file_name)
-            assert arr.size == reduce(mul, shape)
-            return arr.reshape(shape)
-
-    for fd_name in name_map:
-        v1_name = name_map[fd_name]
-        t = fluid.global_scope().find_var(fd_name).get_tensor()
-        shape = np.array(t).shape
-        v1_var = load('./caffe2paddle/' + v1_name, shape)
-        t.set(v1_var, place)
-
-
-if __name__ == "__main__":
-    load_vars()
--- a/fluid/object_detection/mobilenet_ssd.py
+++ b/fluid/object_detection/mobilenet_ssd.py
@@ -27,12 +27,7 @@ def conv_bn(input,
        bias_attr=False)
    parameter_attr = ParamAttr(learning_rate=0.1, initializer=MSRA())
    bias_attr = ParamAttr(learning_rate=0.2)
-    return fluid.layers.batch_norm(
-        input=conv,
-        act=act,
-        epsilon=0.00001,
-        param_attr=parameter_attr,
-        bias_attr=bias_attr)
+    return fluid.layers.batch_norm(input=conv, act=act)


 def depthwise_separable(input, num_filters1, num_filters2, num_groups, stride,
@@ -76,7 +71,7 @@ def extra_block(input, num_filters1, num_filters2, num_groups, stride, scale):
    return normal_conv


-def mobile_net(img, img_shape, scale=1.0):
+def mobile_net(num_classes, img, img_shape, scale=1.0):
    # 300x300
    tmp = conv_bn(img, 3, int(32 * scale), 2, 1, 3)
    # 150x150
@@ -104,10 +99,11 @@ def mobile_net(img, img_shape, scale=1.0):
    module16 = extra_block(module15, 128, 256, 1, 2, scale)
    # 2x2
    module17 = extra_block(module16, 64, 128, 1, 2, scale)
+
    mbox_locs, mbox_confs, box, box_var = fluid.layers.multi_box_head(
        inputs=[module11, module13, module14, module15, module16, module17],
        image=img,
-        num_classes=21,
+        num_classes=num_classes,
        min_ratio=20,
        max_ratio=90,
        min_sizes=[60.0, 105.0, 150.0, 195.0, 240.0, 285.0],

--- a/fluid/object_detection/pretrained/download_coco.sh
+++ b/fluid/object_detection/pretrained/download_coco.sh
+# Run from the directory that contains this script so the archive is
+# downloaded and unpacked next to it, regardless of the caller's cwd.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the ssd_mobilenet_v1_coco archive.
+echo "Downloading..."
+wget http://paddlemodels.bj.bcebos.com/ssd_mobilenet_v1_coco.tar.gz
+# Extract the archive.
+echo "Extracting..."
+tar -xf ssd_mobilenet_v1_coco.tar.gz
--- a/fluid/object_detection/pretrained/download_imagenet.sh
+++ b/fluid/object_detection/pretrained/download_imagenet.sh
+# Run from the directory that contains this script so the archive is
+# downloaded and unpacked next to it, regardless of the caller's cwd.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"
+
+# Download the mobilenet_v1_imagenet archive.
+echo "Downloading..."
+wget http://paddlemodels.bj.bcebos.com/mobilenet_v1_imagenet.tar.gz
+# Extract the archive.
+echo "Extracting..."
+tar -xf mobilenet_v1_imagenet.tar.gz
--- a/fluid/object_detection/reader.py
+++ b/fluid/object_detection/reader.py
@@ -16,19 +16,25 @@ import image_util
 from paddle.utils.image_util import *
 import random
 from PIL import Image
+from PIL import ImageDraw
 import numpy as np
 import xml.etree.ElementTree
 import os
+import time
+import copy


 class Settings(object):
-    def __init__(self, data_dir, label_file, resize_h, resize_w, mean_value,
-                 apply_distort, apply_expand):
+    def __init__(self, dataset, toy, data_dir, label_file, resize_h, resize_w,
+                 mean_value, apply_distort, apply_expand):
+        self._dataset = dataset
+        self._toy = toy
        self._data_dir = data_dir
-        self._label_list = []
-        label_fpath = os.path.join(data_dir, label_file)
-        for line in open(label_fpath):
-            self._label_list.append(line.strip())
+        if dataset == "pascalvoc":
+            self._label_list = []
+            label_fpath = os.path.join(data_dir, label_file)
+            for line in open(label_fpath):
+                self._label_list.append(line.strip())

        self._apply_distort = apply_distort
        self._apply_expand = apply_expand
@@ -47,6 +53,14 @@ class Settings(object):
        self._brightness_prob = 0.5
        self._brightness_delta = 0.125

+    @property
+    def dataset(self):
+        return self._dataset
+
+    @property
+    def toy(self):
+        return self._toy
+
    @property
    def apply_distort(self):
        return self._apply_expand
@@ -59,6 +73,10 @@ class Settings(object):
    def data_dir(self):
        return self._data_dir

+    @data_dir.setter
+    def data_dir(self, data_dir):
+        self._data_dir = data_dir
+
    @property
    def label_list(self):
        return self._label_list
@@ -78,23 +96,76 @@ class Settings(object):

 def _reader_creator(settings, file_list, mode, shuffle):
    def reader():
-        with open(file_list) as flist:
-            lines = [line.strip() for line in flist]
-            if shuffle:
-                random.shuffle(lines)
-            for line in lines:
+        if settings.dataset == 'coco':
+            # cocoapi 
+            from pycocotools.coco import COCO
+            from pycocotools.cocoeval import COCOeval
+
+            coco = COCO(file_list)
+            image_ids = coco.getImgIds()
+            images = coco.loadImgs(image_ids)
+            category_ids = coco.getCatIds()
+            category_names = [
+                item['name'] for item in coco.loadCats(category_ids)
+            ]
+        elif settings.dataset == 'pascalvoc':
+            flist = open(file_list)
+            images = [line.strip() for line in flist]
+
+        if not settings.toy == 0:
+            images = images[:settings.toy] if len(
+                images) > settings.toy else images
+        print("{} on {} with {} images".format(mode, settings.dataset,
+                                               len(images)))
+
+        if shuffle:
+            random.shuffle(images)
+
+        for image in images:
+            if settings.dataset == 'coco':
+                image_name = image['file_name']
+                image_path = os.path.join(settings.data_dir, image_name)
+            elif settings.dataset == 'pascalvoc':
                if mode == 'train' or mode == 'test':
-                    img_path, label_path = line.split()
-                    img_path = os.path.join(settings.data_dir, img_path)
+                    image_path, label_path = image.split()
+                    image_path = os.path.join(settings.data_dir, image_path)
                    label_path = os.path.join(settings.data_dir, label_path)
                elif mode == 'infer':
-                    img_path = os.path.join(settings.data_dir, line)
+                    image_path = os.path.join(settings.data_dir, image)

-                img = Image.open(img_path)
-                img_width, img_height = img.size
+            img = Image.open(image_path)
+            if img.mode == 'L':
+                img = img.convert('RGB')
+            img_width, img_height = img.size

-                # layout: label | xmin | ymin | xmax | ymax | difficult
-                if mode == 'train' or mode == 'test':
+            if mode == 'train' or mode == 'test':
+                if settings.dataset == 'coco':
+                    # layout: category_id | xmin | ymin | xmax | ymax | iscrowd | origin_coco_bbox | segmentation | area | image_id | annotation_id
+                    bbox_labels = []
+                    annIds = coco.getAnnIds(imgIds=image['id'])
+                    anns = coco.loadAnns(annIds)
+                    for ann in anns:
+                        bbox_sample = []
+                        # start from 1, leave 0 to background
+                        bbox_sample.append(
+                            float(category_ids.index(ann['category_id'])) + 1)
+                        bbox = ann['bbox']
+                        xmin, ymin, w, h = bbox
+                        xmax = xmin + w
+                        ymax = ymin + h
+                        bbox_sample.append(float(xmin) / img_width)
+                        bbox_sample.append(float(ymin) / img_height)
+                        bbox_sample.append(float(xmax) / img_width)
+                        bbox_sample.append(float(ymax) / img_height)
+                        bbox_sample.append(float(ann['iscrowd']))
+                        #bbox_sample.append(ann['bbox'])
+                        #bbox_sample.append(ann['segmentation'])
+                        #bbox_sample.append(ann['area'])
+                        #bbox_sample.append(ann['image_id'])
+                        #bbox_sample.append(ann['id'])
+                        bbox_labels.append(bbox_sample)
+                elif settings.dataset == 'pascalvoc':
+                    # layout: label | xmin | ymin | xmax | ymax | difficult
                    bbox_labels = []
                    root = xml.etree.ElementTree.parse(label_path).getroot()
                    for object in root.findall('object'):
@@ -117,91 +188,136 @@ def _reader_creator(settings, file_list, mode, shuffle):
                        bbox_sample.append(difficult)
                        bbox_labels.append(bbox_sample)

-                    sample_labels = bbox_labels
-                    if mode == 'train':
-                        if settings._apply_distort:
-                            img = image_util.distort_image(img, settings)
-                        if settings._apply_expand:
-                            img, bbox_labels = image_util.expand_image(
-                                img, bbox_labels, img_width, img_height,
-                                settings)
-                        batch_sampler = []
-                        # hard-code here
-                        batch_sampler.append(
-                            image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0,
-                                               0.0))
-                        batch_sampler.append(
-                            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1,
-                                               0.0))
-                        batch_sampler.append(
-                            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3,
-                                               0.0))
-                        batch_sampler.append(
-                            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5,
-                                               0.0))
-                        batch_sampler.append(
-                            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7,
-                                               0.0))
-                        batch_sampler.append(
-                            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9,
-                                               0.0))
-                        batch_sampler.append(
-                            image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0,
-                                               1.0))
-                        """ random crop """
-                        sampled_bbox = image_util.generate_batch_samples(
-                            batch_sampler, bbox_labels, img_width, img_height)
-
-                        img = np.array(img)
-                        if len(sampled_bbox) > 0:
-                            idx = int(random.uniform(0, len(sampled_bbox)))
-                            img, sample_labels = image_util.crop_image(
-                                img, bbox_labels, sampled_bbox[idx], img_width,
-                                img_height)
-
-                        img = Image.fromarray(img)
-                img = img.resize((settings.resize_w, settings.resize_h),
-                                 Image.ANTIALIAS)
-                img = np.array(img)
-
+                sample_labels = bbox_labels
                if mode == 'train':
-                    mirror = int(random.uniform(0, 2))
-                    if mirror == 1:
-                        img = img[:, ::-1, :]
-                        for i in xrange(len(sample_labels)):
-                            tmp = sample_labels[i][1]
-                            sample_labels[i][1] = 1 - sample_labels[i][3]
-                            sample_labels[i][3] = 1 - tmp
-
-                if len(img.shape) == 3:
-                    img = np.swapaxes(img, 1, 2)
-                    img = np.swapaxes(img, 1, 0)
-
-                img = img[[2, 1, 0], :, :]
-                img = img.astype('float32')
-                img -= settings.img_mean
-                img = img.flatten()
-                img = img * 0.007843
-
-                sample_labels = np.array(sample_labels)
-                if mode == 'train' or mode == 'test':
-                    if mode == 'train' and len(sample_labels) == 0: continue
-                    yield img.astype(
-                        'float32'
-                    ), sample_labels[:, 1:5], sample_labels[:, 0].astype(
-                        'int32'), sample_labels[:, -1].astype('int32')
-                elif mode == 'infer':
-                    yield img.astype('float32')
+                    if settings._apply_distort:
+                        img = image_util.distort_image(img, settings)
+                    if settings._apply_expand:
+                        img, bbox_labels, img_width, img_height = image_util.expand_image(
+                            img, bbox_labels, img_width, img_height, settings)
+                    batch_sampler = []
+                    # hard-code here
+                    batch_sampler.append(
+                        image_util.sampler(1, 1, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0))
+                    batch_sampler.append(
+                        image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.1, 0.0))
+                    batch_sampler.append(
+                        image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.3, 0.0))
+                    batch_sampler.append(
+                        image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.5, 0.0))
+                    batch_sampler.append(
+                        image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.7, 0.0))
+                    batch_sampler.append(
+                        image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.9, 0.0))
+                    batch_sampler.append(
+                        image_util.sampler(1, 50, 0.3, 1.0, 0.5, 2.0, 0.0, 1.0))
+                    """ random crop """
+                    sampled_bbox = image_util.generate_batch_samples(
+                        batch_sampler, bbox_labels, img_width, img_height)
+
+                    img = np.array(img)
+                    if len(sampled_bbox) > 0:
+                        idx = int(random.uniform(0, len(sampled_bbox)))
+                        img, sample_labels = image_util.crop_image(
+                            img, bbox_labels, sampled_bbox[idx], img_width,
+                            img_height)
+
+                    img = Image.fromarray(img)
+            img = img.resize((settings.resize_w, settings.resize_h),
+                             Image.ANTIALIAS)
+            img = np.array(img)
+
+            if mode == 'train':
+                mirror = int(random.uniform(0, 2))
+                if mirror == 1:
+                    img = img[:, ::-1, :]
+                    for i in xrange(len(sample_labels)):
+                        tmp = sample_labels[i][1]
+                        sample_labels[i][1] = 1 - sample_labels[i][3]
+                        sample_labels[i][3] = 1 - tmp
+
+            # HWC to CHW
+            if len(img.shape) == 3:
+                img = np.swapaxes(img, 1, 2)
+                img = np.swapaxes(img, 1, 0)
+            # RGB to BGR
+            img = img[[2, 1, 0], :, :]
+            img = img.astype('float32')
+            img -= settings.img_mean
+            img = img.flatten()
+            img = img * 0.007843
+
+            sample_labels = np.array(sample_labels)
+            if mode == 'train' or mode == 'test':
+                if mode == 'train' and len(sample_labels) == 0: continue
+                if mode == 'test' and len(sample_labels) == 0: continue
+                yield img.astype(
+                    'float32'
+                ), sample_labels[:, 1:5], sample_labels[:, 0].astype(
+                    'int32'), sample_labels[:, -1].astype('int32')
+            elif mode == 'infer':
+                yield img.astype('float32')

    return reader


+def draw_bounding_box_on_image(image,
+                               sample_labels,
+                               image_name,
+                               category_names,
+                               color='red',
+                               thickness=4,
+                               with_text=True,
+                               normalized=True):
+    """Draw every labelled box on `image` and save the picture.
+
+    Args:
+        image: array accepted by `Image.fromarray`.
+        sample_labels: iterable of rows; element 0 is the label index into
+            `category_names`, elements 1..4 are xmin, ymin, xmax, ymax.
+        image_name: destination passed to the image's `save`.
+        category_names: sequence indexed by the integer label.
+        color: fill colour of the box outline.
+        thickness: outline width handed to `draw.line`.
+        with_text: when true, the category name is written at the box's
+            (left, top) corner; this only happens for images in 'RGB' mode.
+        normalized: when true, box coordinates are multiplied by the image
+            width/height; otherwise they are used unchanged.
+    """
+    canvas = Image.fromarray(image)
+    painter = ImageDraw.Draw(canvas)
+    # Unit scale leaves already-absolute coordinates untouched.
+    scale_x, scale_y = canvas.size if normalized else (1, 1)
+    annotate = with_text and canvas.mode == 'RGB'
+    for entry in sample_labels:
+        caption = category_names[int(entry[0])]
+        xmin, ymin, xmax, ymax = entry[1:5]
+        left, top = xmin * scale_x, ymin * scale_y
+        right, bottom = xmax * scale_x, ymax * scale_y
+        # Closed polyline: the first corner is repeated to finish the box.
+        outline = [(left, top), (left, bottom), (right, bottom),
+                   (right, top), (left, top)]
+        painter.line(outline, width=thickness, fill=color)
+        if annotate:
+            painter.text((left, top), caption, (255, 255, 0))
+    canvas.save(image_name)
+
+
 def train(settings, file_list, shuffle=True):
-    return _reader_creator(settings, file_list, 'train', shuffle)
+    """Create the training-mode reader for `settings.dataset`.
+
+    `file_list` is resolved relative to `settings.data_dir`. For COCO the
+    reader receives a shallow copy of `settings` whose `data_dir` points at
+    the `train2014` or `train2017` sub-directory, chosen by which year
+    appears in the resolved list path; `settings` itself is not modified.
+
+    Args:
+        settings: object exposing `dataset` and `data_dir`.
+        file_list: list/annotation file path relative to `settings.data_dir`.
+        shuffle: forwarded to `_reader_creator`.
+
+    Returns:
+        Whatever `_reader_creator` returns for mode 'train'.
+
+    Raises:
+        ValueError: if `settings.dataset` is neither 'coco' nor 'pascalvoc',
+            or if no COCO year is found in the resolved list path.
+    """
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'pascalvoc':
+        return _reader_creator(settings, file_list, 'train', shuffle)
+    if settings.dataset != 'coco':
+        # Previously this fell through and returned None.
+        raise ValueError("unsupported dataset: %r" % (settings.dataset, ))
+    if '2014' in file_list:
+        sub_dir = "train2014"
+    elif '2017' in file_list:
+        sub_dir = "train2017"
+    else:
+        # Previously `sub_dir` stayed unbound and the join below crashed.
+        raise ValueError(
+            "cannot infer the COCO release (2014 or 2017) from %r" %
+            (file_list, ))
+    train_settings = copy.copy(settings)
+    train_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+    return _reader_creator(train_settings, file_list, 'train', shuffle)


 def test(settings, file_list):
-    return _reader_creator(settings, file_list, 'test', False)
+    """Create the test-mode (unshuffled) reader for `settings.dataset`.
+
+    `file_list` is resolved relative to `settings.data_dir`. For COCO the
+    reader receives a shallow copy of `settings` whose `data_dir` points at
+    the `val2014` or `val2017` sub-directory, chosen by which year appears
+    in the resolved list path; `settings` itself is not modified.
+
+    Args:
+        settings: object exposing `dataset` and `data_dir`.
+        file_list: list/annotation file path relative to `settings.data_dir`.
+
+    Returns:
+        Whatever `_reader_creator` returns for mode 'test'.
+
+    Raises:
+        ValueError: if `settings.dataset` is neither 'coco' nor 'pascalvoc',
+            or if no COCO year is found in the resolved list path.
+    """
+    file_list = os.path.join(settings.data_dir, file_list)
+    if settings.dataset == 'pascalvoc':
+        return _reader_creator(settings, file_list, 'test', False)
+    if settings.dataset != 'coco':
+        # Previously this fell through and returned None.
+        raise ValueError("unsupported dataset: %r" % (settings.dataset, ))
+    if '2014' in file_list:
+        sub_dir = "val2014"
+    elif '2017' in file_list:
+        sub_dir = "val2017"
+    else:
+        # Previously `sub_dir` stayed unbound and the join below crashed.
+        raise ValueError(
+            "cannot infer the COCO release (2014 or 2017) from %r" %
+            (file_list, ))
+    test_settings = copy.copy(settings)
+    test_settings.data_dir = os.path.join(settings.data_dir, sub_dir)
+    return _reader_creator(test_settings, file_list, 'test', False)


 def infer(settings, file_list):

--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
-import paddle.v2 as paddle
+import paddle
 import paddle.fluid as fluid
 import reader
 import load_model as load_model
 from mobilenet_ssd import mobile_net
 from utility import add_arguments, print_arguments
 import os
+import time
 import numpy as np
 import argparse
 import functools
@@ -12,22 +13,40 @@ import functools
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('batch_size',   int,    32,       "Minibatch size.")
-add_arg('parallel',     bool,   True,     "Whether use parallel training.")
-add_arg('use_gpu',      bool,   True,     "Whether use GPU.")
+# Command-line flags for training. The positional arguments appear to be
+# (name, type, default, help text) -- see utility.add_arguments to confirm.
+add_arg('learning_rate',    float, 0.001,     "Learning rate.")
+add_arg('batch_size',       int,   32,        "Minibatch size.")
+add_arg('num_passes',       int,   25,        "Epoch number.")
+add_arg('parallel',         bool,  True,      "Whether use parallel training.")
+add_arg('use_gpu',          bool,  True,      "Whether use GPU.")
+add_arg('use_nccl',         bool,  False,     "Whether use NCCL.")
+add_arg('dataset',          str, 'pascalvoc', "coco or pascalvoc.")
+add_arg('model_save_dir',   str, 'model',     "The path to save model.")
+add_arg('pretrained_model', str, 'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
+add_arg('apply_distort',    bool, True,   "Whether apply distort")
+add_arg('apply_expand',     bool, False,  "Whether appley expand")
+add_arg('resize_h',         int,  300,    "resize image size")
+add_arg('resize_w',         int,  300,    "resize image size")
+# NOTE(review): the trailing numbers below look like alternative per-channel
+# means; 123.68/116.78/103.94 are commonly quoted in R/G/B order, which would
+# be reversed relative to these B/G/R flags -- confirm before using them.
+add_arg('mean_value_B',     float, 127.5, "mean value which will be subtracted")  #123.68
+add_arg('mean_value_G',     float, 127.5, "mean value which will be subtracted")  #116.78
+add_arg('mean_value_R',     float, 127.5, "mean value which will be subtracted")  #103.94
+add_arg('is_toy',           int, 0, "Toy for quick debug, 0 means using all data, while n means using only n sample")
+# NOTE(review): the marker below repeats "disable"; presumably it was meant
+# to be "# yapf: enable" to close the region opened above -- confirm.
 # yapf: disable


-def train(args,
-          train_file_list,
-          val_file_list,
-          data_args,
-          learning_rate,
-          batch_size,
-          num_passes,
-          model_save_dir='model',
-          init_model_path=None):
+def parallel_do(args,
+                train_file_list,
+                val_file_list,
+                data_args,
+                learning_rate,
+                batch_size,
+                num_passes,
+                model_save_dir,
+                pretrained_model=None):
    image_shape = [3, data_args.resize_h, data_args.resize_w]
+    if data_args.dataset == 'coco':
+        num_classes = 81
+    elif data_args.dataset == 'pascalvoc':
+        num_classes = 21

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    gt_box = fluid.layers.data(
@@ -39,15 +58,16 @@ def train(args,

    if args.parallel:
        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
+        pd = fluid.layers.ParallelDo(places, use_nccl=args.use_nccl)
        with pd.do():
            image_ = pd.read_input(image)
            gt_box_ = pd.read_input(gt_box)
            gt_label_ = pd.read_input(gt_label)
            difficult_ = pd.read_input(difficult)
-            locs, confs, box, box_var = mobile_net(image_, image_shape)
-            loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_,
-                                         box, box_var)
+            locs, confs, box, box_var = mobile_net(num_classes, image_,
+                                                   image_shape)
+            loss = fluid.layers.ssd_loss(locs, confs, gt_box_, gt_label_, box,
+                                         box_var)
            nmsed_out = fluid.layers.detection_output(
                locs, confs, box, box_var, nms_threshold=0.45)
            loss = fluid.layers.reduce_sum(loss)
@@ -57,11 +77,11 @@ def train(args,
        loss, nmsed_out = pd()
        loss = fluid.layers.mean(loss)
    else:
-        locs, confs, box, box_var = mobile_net(image, image_shape)
+        locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
        nmsed_out = fluid.layers.detection_output(
            locs, confs, box, box_var, nms_threshold=0.45)
-        loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label,
-                                     box, box_var)
+        loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
+                                     box_var)
        loss = fluid.layers.reduce_sum(loss)

    test_program = fluid.default_main_program().clone(for_test=True)
@@ -71,13 +91,20 @@ def train(args,
            gt_label,
            gt_box,
            difficult,
-            21,
+            num_classes,
            overlap_threshold=0.5,
            evaluate_difficult=False,
-            ap_version='11point')
+            ap_version='integral')

-    boundaries = [40000, 60000]
-    values = [0.001, 0.0005, 0.00025]
+    if data_args.dataset == 'coco':
+        # learning rate decay in 12, 19 pass, respectively
+        if '2014' in train_file_list:
+            boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19]
+        elif '2017' in train_file_list:
+            boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19]
+    elif data_args.dataset == 'pascalvoc':
+        boundaries = [40000, 60000]
+    values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
    optimizer = fluid.optimizer.RMSProp(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
        regularization=fluid.regularizer.L2Decay(0.00005), )
@@ -88,8 +115,11 @@ def train(args,
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

-    load_model.load_and_set_vars(place)
-    #load_model.load_paddlev1_vars(place)
+    if pretrained_model:
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
    train_reader = paddle.batch(
        reader.train(data_args, train_file_list), batch_size=batch_size)
    test_reader = paddle.batch(
@@ -108,37 +138,167 @@ def train(args,
        print("Test {0}, map {1}".format(pass_id, test_map[0]))

    for pass_id in range(num_passes):
+        start_time = time.time()
+        prev_start_time = start_time
+        end_time = 0
        for batch_id, data in enumerate(train_reader()):
+            prev_start_time = start_time
+            start_time = time.time()
            loss_v = exe.run(fluid.default_main_program(),
                             feed=feeder.feed(data),
                             fetch_list=[loss])
+            end_time = time.time()
            if batch_id % 20 == 0:
-                print("Pass {0}, batch {1}, loss {2}"
-                      .format(pass_id, batch_id, loss_v[0]))
+                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
+                    pass_id, batch_id, loss_v[0], start_time - prev_start_time))
        test(pass_id)

-        if pass_id % 10 == 0:
+        if pass_id % 10 == 0 or pass_id == num_passes - 1:
            model_path = os.path.join(model_save_dir, str(pass_id))
            print 'save models to %s' % (model_path)
-            fluid.io.save_inference_model(model_path, ['image'], [nmsed_out],
-                                          exe)
+            fluid.io.save_persistables(exe, model_path)
+
+
+def parallel_exe(args,
+                 train_file_list,
+                 val_file_list,
+                 data_args,
+                 learning_rate,
+                 batch_size,
+                 num_passes,
+                 model_save_dir='model',
+                 pretrained_model=None):
+    """Train MobileNet-SSD using `fluid.ParallelExecutor`.
+
+    The network, SSD loss, mAP evaluator and RMSProp optimizer are built in
+    the default program. Each pass first evaluates on the test reader and
+    then trains over the train reader; persistable variables are saved on
+    every 10th pass and after the final pass.
+
+    Args:
+        args: parsed command-line flags; only `use_gpu` is read here.
+        train_file_list: training list path handed to `reader.train`; for
+            COCO it must contain '2014' or '2017'.
+        val_file_list: validation list path handed to `reader.test`.
+        data_args: reader settings; `dataset`, `resize_h` and `resize_w`
+            are read here.
+        learning_rate: initial learning rate; it is halved at each of the
+            two decay boundaries.
+        batch_size: minibatch size used for both readers.
+        num_passes: number of passes over the training data.
+        model_save_dir: directory under which per-pass checkpoints are saved.
+        pretrained_model: optional directory of variables to load; only
+            variables whose name exists as a file in it are loaded.
+
+    Raises:
+        ValueError: if `data_args.dataset` is neither 'coco' nor
+            'pascalvoc', or if the COCO release year cannot be found in
+            `train_file_list`.
+    """
+    image_shape = [3, data_args.resize_h, data_args.resize_w]
+    if data_args.dataset == 'coco':
+        num_classes = 81
+    elif data_args.dataset == 'pascalvoc':
+        num_classes = 21
+    else:
+        # Fail early instead of hitting an unbound `num_classes` below.
+        raise ValueError("unsupported dataset: %r" % (data_args.dataset, ))
+
+    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+    gt_box = fluid.layers.data(
+        name='gt_box', shape=[4], dtype='float32', lod_level=1)
+    gt_label = fluid.layers.data(
+        name='gt_label', shape=[1], dtype='int32', lod_level=1)
+    difficult = fluid.layers.data(
+        name='gt_difficult', shape=[1], dtype='int32', lod_level=1)

+    locs, confs, box, box_var = mobile_net(num_classes, image, image_shape)
+    nmsed_out = fluid.layers.detection_output(
+        locs, confs, box, box_var, nms_threshold=0.45)
+    loss = fluid.layers.ssd_loss(locs, confs, gt_box, gt_label, box,
+                                 box_var)
+    loss = fluid.layers.reduce_sum(loss)
+
+    # The test program is cloned before the optimizer is attached.
+    test_program = fluid.default_main_program().clone(for_test=True)
+    with fluid.program_guard(test_program):
+        map_eval = fluid.evaluator.DetectionMAP(
+            nmsed_out,
+            gt_label,
+            gt_box,
+            difficult,
+            num_classes,
+            overlap_threshold=0.5,
+            evaluate_difficult=False,
+            ap_version='integral')
+
+    if data_args.dataset == 'coco':
+        # learning rate decay in 12, 19 pass, respectively
+        # (82783 / 118287 are presumably the train-set sizes -- TODO confirm)
+        if '2014' in train_file_list:
+            boundaries = [82783 / batch_size * 12, 82783 / batch_size * 19]
+        elif '2017' in train_file_list:
+            boundaries = [118287 / batch_size * 12, 118287 / batch_size * 19]
+        else:
+            # Previously `boundaries` stayed unbound in this case.
+            raise ValueError(
+                "cannot infer the COCO release (2014 or 2017) from %r" %
+                (train_file_list, ))
+    elif data_args.dataset == 'pascalvoc':
+        boundaries = [40000, 60000]
+    values = [learning_rate, learning_rate * 0.5, learning_rate * 0.25]
+    optimizer = fluid.optimizer.RMSProp(
+        learning_rate=fluid.layers.piecewise_decay(boundaries, values),
+        regularization=fluid.regularizer.L2Decay(0.00005), )
+
+    optimizer.minimize(loss)
+
+    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    if pretrained_model:
+        def if_exist(var):
+            return os.path.exists(os.path.join(pretrained_model, var.name))
+        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+
+    train_exe = fluid.ParallelExecutor(use_cuda=args.use_gpu,
+                                       loss_name=loss.name)
+
+    train_reader = paddle.batch(
+        reader.train(data_args, train_file_list), batch_size=batch_size)
+    test_reader = paddle.batch(
+        reader.test(data_args, val_file_list), batch_size=batch_size)
+    feeder = fluid.DataFeeder(
+        place=place, feed_list=[image, gt_box, gt_label, difficult])
+
+    def test(pass_id):
+        """Run the test program over the test reader and print the mAP."""
+        _, accum_map = map_eval.get_map_var()
+        map_eval.reset(exe)
+        test_map = None
+        for _, data in enumerate(test_reader()):
+            test_map = exe.run(test_program,
+                               feed=feeder.feed(data),
+                               fetch_list=[accum_map])
+        if test_map is None:
+            # An empty test reader yields no result; indexing None would
+            # raise TypeError.
+            print("Test {0}, no test samples".format(pass_id))
+            return
+        print("Test {0}, map {1}".format(pass_id, test_map[0]))
+
+    for pass_id in range(num_passes):
+        start_time = time.time()
+        prev_start_time = start_time
+        test(pass_id)
+        for batch_id, data in enumerate(train_reader()):
+            prev_start_time = start_time
+            start_time = time.time()
+            loss_v, = train_exe.run(fetch_list=[loss.name],
+                                   feed_dict=feeder.feed(data))
+            loss_v = np.mean(np.array(loss_v))
+            if batch_id % 20 == 0:
+                # "time" is the gap between the starts of consecutive
+                # batches (for batch 0 it also covers the evaluation above).
+                print("Pass {0}, batch {1}, loss {2}, time {3}".format(
+                    pass_id, batch_id, loss_v, start_time - prev_start_time))
+
+        if pass_id % 10 == 0 or pass_id == num_passes - 1:
+            model_path = os.path.join(model_save_dir, str(pass_id))
+            print 'save models to %s' % (model_path)
+            fluid.io.save_persistables(exe, model_path)

 if __name__ == '__main__':
    args = parser.parse_args()
    print_arguments(args)
+
+    # Paths default to the Pascal VOC layout and are overridden below when
+    # --dataset is 'coco'. The list files are given relative to data_dir.
+    data_dir = 'data/pascalvoc'
+    train_file_list = 'trainval.txt'
+    val_file_list = 'test.txt'
+    label_file = 'label_list'
+    model_save_dir = args.model_save_dir
+    if args.dataset == 'coco':
+        data_dir = './data/COCO17'
+        train_file_list = 'annotations/instances_train2017.json'
+        val_file_list = 'annotations/instances_val2017.json'
+        label_file = 'label_list'
+
    data_args = reader.Settings(
-        data_dir='./data',
-        label_file='label_list',
-        apply_distort=True,
-        apply_expand=True,
-        resize_h=300,
-        resize_w=300,
-        mean_value=[127.5, 127.5, 127.5])
-    train(args,
-          train_file_list='./data/trainval.txt',
-          val_file_list='./data/test.txt',
-          data_args=data_args,
-          learning_rate=0.001,
-          batch_size=args.batch_size,
-          num_passes=300)
+        dataset=args.dataset,
+        toy=args.is_toy,
+        data_dir=data_dir,
+        label_file=label_file,
+        apply_distort=args.apply_distort,
+        apply_expand=args.apply_expand,
+        resize_h=args.resize_h,
+        resize_w=args.resize_w,
+        # Per-channel means are passed in B, G, R order.
+        mean_value=[args.mean_value_B, args.mean_value_G, args.mean_value_R])
+    # Training driver: parallel_exe is the one in use; parallel_do is kept
+    # as a commented-out alternative with the same call signature.
+    #method = parallel_do
+    method = parallel_exe
+    method(args,
+           train_file_list=train_file_list,
+           val_file_list=val_file_list,
+           data_args=data_args,
+           learning_rate=args.learning_rate,
+           batch_size=args.batch_size,
+           num_passes=args.num_passes,
+           model_save_dir=model_save_dir,
+           pretrained_model=args.pretrained_model)
--- a/fluid/policy_gradient/brain.py
+++ b/fluid/policy_gradient/brain.py
@@ -30,32 +30,28 @@ class PolicyGradient:
        acts = fluid.layers.data(name='acts', shape=[1], dtype='int64')
        vt = fluid.layers.data(name='vt', shape=[1], dtype='float32')
        # fc1
-        fc1 = fluid.layers.fc(
-            input=obs,
-            size=10,
-            act="tanh"  # tanh activation
-        )
+        fc1 = fluid.layers.fc(input=obs, size=10, act="tanh")  # tanh activation
        # fc2
-        self.all_act_prob = fluid.layers.fc(input=fc1,
-                                            size=self.n_actions,
-                                            act="softmax")
+        all_act_prob = fluid.layers.fc(input=fc1,
+                                       size=self.n_actions,
+                                       act="softmax")
+        self.inferece_program = fluid.defaul_main_program().clone()
        # to maximize total reward (log_p * R) is to minimize -(log_p * R)
        neg_log_prob = fluid.layers.cross_entropy(
            input=self.all_act_prob,
            label=acts)  # this is negative log of chosen action
        neg_log_prob_weight = fluid.layers.elementwise_mul(x=neg_log_prob, y=vt)
        loss = fluid.layers.reduce_mean(
-            x=neg_log_prob_weight)  # reward guided loss
+            neg_log_prob_weight)  # reward guided loss

        sgd_optimizer = fluid.optimizer.SGD(self.lr)
        sgd_optimizer.minimize(loss)
        self.exe.run(fluid.default_startup_program())

    def choose_action(self, observation):
-        prob_weights = self.exe.run(
-            fluid.default_main_program().prune(self.all_act_prob),
-            feed={"obs": observation[np.newaxis, :]},
-            fetch_list=[self.all_act_prob])
+        prob_weights = self.exe.run(self.inferece_program,
+                                    feed={"obs": observation[np.newaxis, :]},
+                                    fetch_list=[self.all_act_prob])
        prob_weights = np.array(prob_weights[0])
        action = np.random.choice(
            range(prob_weights.shape[1]),