Commit be80e25e authored by Yibing Liu

Merge branch 'develop' of https://github.com/PaddlePaddle/models into dam_py3

......@@ -8,7 +8,7 @@ PaddlePaddle provides a rich set of computational units to enable users to adopt
- [fluid models](fluid): use PaddlePaddle's Fluid APIs. We especially recommend the Fluid models.
- [v2 models](v2): use PaddlePaddle's v2 APIs.
- [legacy models](legacy): use PaddlePaddle's v2 APIs.
## License
......
......@@ -2,7 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.v2 as paddle
import paddle.fluid as fluid
......
......@@ -158,7 +158,8 @@ class DuelingDQNModel(object):
for i, var in enumerate(policy_vars):
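# assign copies each policy-network parameter into the matching target-network parameter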
sync_op = fluid.layers.assign(policy_vars[i], target_vars[i])
sync_ops.append(sync_op)
sync_program = sync_program.prune(sync_ops)
# The prune API is deprecated, please don't use it any more.
sync_program = sync_program._prune(sync_ops)
return sync_program
def act(self, state, train_or_test):
......
......@@ -9,7 +9,7 @@ import gym
from gym import spaces
from gym.envs.atari.atari_env import ACTION_MEANING
from ale_python_interface import ALEInterface
from atari_py import ALEInterface
__all__ = ['AtariPlayer']
......
"""
CNN on MNIST data using the Fluid API of PaddlePaddle
"""
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
......
......@@ -8,7 +8,7 @@ sys.path.append("..")
import matplotlib.pyplot as plt
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from advbox.adversary import Adversary
from advbox.attacks.gradient_method import BIM
......
......@@ -8,7 +8,7 @@ sys.path.append("..")
import matplotlib.pyplot as plt
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from advbox.adversary import Adversary
from advbox.attacks.deepfool import DeepFoolAttack
......
......@@ -8,7 +8,7 @@ sys.path.append("..")
import matplotlib.pyplot as plt
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from advbox.adversary import Adversary
from advbox.attacks.gradient_method import FGSM
......
......@@ -7,7 +7,7 @@ sys.path.append("..")
import matplotlib.pyplot as plt
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from advbox.adversary import Adversary
from advbox.attacks.gradient_method import ILCM
......
......@@ -7,7 +7,7 @@ sys.path.append("..")
import matplotlib.pyplot as plt
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from advbox.adversary import Adversary
from advbox.attacks.saliency import JSMA
......
......@@ -7,7 +7,7 @@ sys.path.append("..")
import matplotlib.pyplot as plt
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from advbox.adversary import Adversary
from advbox.attacks.lbfgs import LBFGS
......
......@@ -9,7 +9,7 @@ sys.path.append("..")
import matplotlib.pyplot as plt
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from advbox.adversary import Adversary
from advbox.attacks.gradient_method import MIFGSM
......
......@@ -55,6 +55,12 @@ for more detailed explanation about the arguments, please run
python ../train_and_evaluate.py --help
```
By default, training runs on a single GPU; it can be switched to multi-GPU mode by resetting the visible devices in `train.sh`, e.g.,
```
export CUDA_VISIBLE_DEVICES=0,1,2,3
```
4) Run the test with:
```
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
python -u ../test_and_evaluate.py --use_cuda \
--ext_eval \
--data_path ./data/data.pkl \
--save_path ./eval_10000 \
--model_path models/step_10000 \
--batch_size 100 \
--save_path ./eval_3900 \
--model_path models/step_3900 \
--channel1_num 16 \
--batch_size 200 \
--vocab_size 172130 \
--emb_size 200 \
--_EOS_ 1
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
export FLAGS_eager_delete_tensor_gb=0.0
python -u ../train_and_evaluate.py --use_cuda \
--data_path ./data/data.pkl \
--ext_eval \
--word_emb_init ./data/word_embedding.pkl \
--save_path ./models \
--batch_size 100 \
--batch_size 256 \
--vocab_size 172130 \
--channel1_num 16 \
--emb_size 200 \
--_EOS_ 1
......@@ -6,18 +6,25 @@ import utils.layers as layers
class Net(object):
def __init__(self, max_turn_num, max_turn_len, vocab_size, emb_size,
stack_num):
stack_num, channel1_num, channel2_num):
self._max_turn_num = max_turn_num
self._max_turn_len = max_turn_len
self._vocab_size = vocab_size
self._emb_size = emb_size
self._stack_num = stack_num
self._channel1_num = channel1_num
self._channel2_num = channel2_num
self.word_emb_name = "shared_word_emb"
self.use_stack_op = True
self.use_mask_cache = True
self.use_sparse_embedding = True
def set_word_embedding(self, word_emb, place):
word_emb_param = fluid.global_scope().find_var(
self.word_emb_name).get_tensor()
word_emb_param.set(word_emb, place)
def create_network(self):
mask_cache = dict() if self.use_mask_cache else None
......@@ -136,7 +143,7 @@ class Net(object):
t_a_r = fluid.layers.concat(input=t_a_r_stack, axis=1)
r_a_t = fluid.layers.concat(input=r_a_t_stack, axis=1)
# sim shape: [batch_size, 2*(stack_num+2), max_turn_len, max_turn_len]
# sim shape: [batch_size, 2*(stack_num+1), max_turn_len, max_turn_len]
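# stack_num attention layers plus the original embedding give stack_num+1 representations per side, so t_a_r and r_a_t together contribute 2*(stack_num+1) channels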
sim = fluid.layers.matmul(
x=t_a_r, y=r_a_t, transpose_y=True, alpha=1 / np.sqrt(200.0))
sim_turns.append(sim)
......@@ -147,10 +154,9 @@ class Net(object):
for index in six.moves.xrange(len(sim_turns)):
sim_turns[index] = fluid.layers.unsqueeze(
input=sim_turns[index], axes=[2])
# sim shape: [batch_size, 2*(stack_num+2), max_turn_num, max_turn_len, max_turn_len]
# sim shape: [batch_size, 2*(stack_num+1), max_turn_num, max_turn_len, max_turn_len]
sim = fluid.layers.concat(input=sim_turns, axis=2)
# for douban
final_info = layers.cnn_3d(sim, 32, 16)
final_info = layers.cnn_3d(sim, self._channel1_num, self._channel2_num)
loss, logits = layers.loss(final_info, label)
return loss, logits
......@@ -89,6 +89,16 @@ def parse_args():
type=int,
default=5,
help='The number of stacked attentive modules in network.')
parser.add_argument(
'--channel1_num',
type=int,
default=32,
help="The channels' number of the 1st conv3d layer's output.")
parser.add_argument(
'--channel2_num',
type=int,
default=16,
help="The channels' number of the 2nd conv3d layer's output.")
args = parser.parse_args()
return args
......@@ -110,7 +120,8 @@ def test(args):
}
dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
args.emb_size, args.stack_num)
args.emb_size, args.stack_num, args.channel1_num,
args.channel2_num)
loss, logits = dam.create_network()
loss.persistable = True
......
......@@ -88,6 +88,16 @@ def parse_args():
type=int,
default=5,
help='The number of stacked attentive modules in network.')
parser.add_argument(
'--channel1_num',
type=int,
default=32,
help="The channels' number of the 1st conv3d layer's output.")
parser.add_argument(
'--channel2_num',
type=int,
default=16,
help="The channels' number of the 2nd conv3d layer's output.")
args = parser.parse_args()
return args
......@@ -105,7 +115,8 @@ def train(args):
}
dam = Net(args.max_turn_num, args.max_turn_len, args.vocab_size,
args.emb_size, args.stack_num)
args.emb_size, args.stack_num, args.channel1_num,
args.channel2_num)
loss, logits = dam.create_network()
loss.persistable = True
......@@ -136,6 +147,9 @@ def train(args):
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
print("device count %d" % dev_count)
print("theoretical memory usage: ")
print(fluid.contrib.memory_usage(
program=train_program, batch_size=args.batch_size))
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())
......@@ -157,7 +171,8 @@ def train(args):
print("start loading word embedding init ...")
word_emb = np.array(pickle.load(open(args.word_emb_init, 'rb'))).astype(
'float32')
print("finish loading word embedding init ...")
dam.set_word_embedding(word_emb, place)
print("finish init word embedding ...")
print("start loading data ...")
train_data, val_data, test_data = pickle.load(open(args.data_path, 'rb'))
......@@ -171,8 +186,6 @@ def train(args):
print_step = max(1, batch_num / (dev_count * 100))
save_step = max(1, batch_num / (dev_count * 10))
word_emb_inited = False
print("begin model training ...")
print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
......@@ -187,12 +200,8 @@ def train(args):
for dev in xrange(dev_count):
index = it * dev_count + dev
feed_dict = reader.make_one_batch_input(train_batches, index)
if word_emb_inited is False and args.word_emb_init is not None:
feed_dict[dam.word_emb_name] = word_emb
feed_list.append(feed_dict)
word_emb_inited = True
cost = train_exe.run(feed=feed_list, fetch_list=[loss.name])
ave_cost += np.array(cost[0]).mean()
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
python -u ../test_and_evaluate.py --use_cuda \
--data_path ./data/data.pkl \
--save_path ./ \
--model_path models/step_10000 \
--batch_size 100 \
--save_path ./step_3900 \
--model_path ./models/step_3900 \
--batch_size 200 \
--vocab_size 434512 \
--emb_size 200 \
--_EOS_ 28270
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
export CUDA_VISIBLE_DEVICES=0
export FLAGS_eager_delete_tensor_gb=0.0
python -u ../train_and_evaluate.py --use_cuda \
--data_path ./data/data.pkl \
--word_emb_init ./data/word_embedding.pkl \
--save_path ./models \
--batch_size 100 \
--batch_size 256 \
--vocab_size 434512 \
--emb_size 200 \
--_EOS_ 28270
......
deeplabv3plus_xception65_initialize.params
deeplabv3plus.params
deeplabv3plus.tar.gz
Running the DeepLab sample code in this directory requires the latest PaddlePaddle develop version. If your installed PaddlePaddle is older than this, please follow the instructions in the [installation document](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html) to update it.
Running the DeepLab sample code in this directory requires PaddlePaddle Fluid v1.0.0 or later. If your installed PaddlePaddle is older than this, please follow the instructions in the installation document to update it. When running on GPU, the program also requires cuDNN v7.
## Code structure
......@@ -41,10 +41,12 @@ data/cityscape/
To train the model from scratch, download our initialization model:
```
wget http://paddlemodels.cdn.bcebos.com/deeplab/deeplabv3plus_xception65_initialize.tar.gz
tar -xf deeplabv3plus_xception65_initialize.tar.gz && rm deeplabv3plus_xception65_initialize.tar.gz
```
To fine-tune the final trained model or use it directly for prediction, download our final model:
```
wget http://paddlemodels.cdn.bcebos.com/deeplab/deeplabv3plus.tar.gz
tar -xf deeplabv3plus.tar.gz && rm deeplabv3plus.tar.gz
```
......@@ -70,11 +72,11 @@ python train.py --help
```
python ./train.py \
--batch_size=8 \
--parallel=true
--parallel=true \
--train_crop_size=769 \
--total_step=90000 \
--init_weights_path=$INIT_WEIGHTS_PATH \
--save_weights_path=$SAVE_WEIGHTS_PATH \
--init_weights_path=deeplabv3plus_xception65_initialize.params \
--save_weights_path=output \
--dataset_path=$DATASET_PATH
```
......@@ -82,11 +84,10 @@ python ./train.py \
Run the following command to evaluate on the `Cityscape` test dataset:
```
python ./eval.py \
--init_weights_path=$INIT_WEIGHTS_PATH \
--init_weights=deeplabv3plus.params \
--dataset_path=$DATASET_PATH
```
The model file must be specified via the `--model_path` option.
The evaluation metric reported by the test script is [mean IoU]().
The model file must be specified via the `--model_path` option. The evaluation metric reported by the test script is mean IoU.
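For reference, mean IoU averages the per-class intersection-over-union over the classes that actually appear. A minimal NumPy sketch of the metric (illustrative only, not the exact accumulation logic of `eval.py`):
```
import numpy as np

def mean_iou(pred, label, num_classes=19):
    # per-class IoU = intersection / union, averaged over the classes present
    ious = []
    for c in range(num_classes):
        inter = np.sum((pred == c) & (label == c))
        union = np.sum((pred == c) | (label == c))
        if union > 0:
            ious.append(float(inter) / union)
    return np.mean(ious)
```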
## Experimental results
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = '0.98'
......@@ -91,7 +94,7 @@ exe = fluid.Executor(place)
exe.run(sp)
if args.init_weights_path:
print "load from:", args.init_weights_path
print("load from:", args.init_weights_path)
load_model()
dataset = CityscapeDataset(args.dataset_path, 'val')
......@@ -118,7 +121,7 @@ for i, imgs, labels, names in batches:
mp = (wrong + right) != 0
miou2 = np.mean((right[mp] * 1.0 / (right[mp] + wrong[mp])))
if args.verbose:
print 'step: %s, mIoU: %s' % (i + 1, miou2)
print('step: %s, mIoU: %s' % (i + 1, miou2))
else:
print '\rstep: %s, mIoU: %s' % (i + 1, miou2),
print('\rstep: %s, mIoU: %s' % (i + 1, miou2))
sys.stdout.flush()
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
......@@ -50,7 +53,7 @@ def append_op_result(result, name):
def conv(*args, **kargs):
kargs['param_attr'] = name_scope + 'weights'
if kargs.has_key('bias_attr') and kargs['bias_attr']:
if 'bias_attr' in kargs and kargs['bias_attr']:
kargs['bias_attr'] = name_scope + 'biases'
else:
kargs['bias_attr'] = False
......@@ -62,7 +65,7 @@ def group_norm(input, G, eps=1e-5, param_attr=None, bias_attr=None):
N, C, H, W = input.shape
if C % G != 0:
print "group can not divide channle:", C, G
print("group can not divide channle:", C, G)
for d in range(10):
for t in [d, -d]:
if G + t <= 0: continue
......@@ -70,7 +73,7 @@ def group_norm(input, G, eps=1e-5, param_attr=None, bias_attr=None):
G = G + t
break
if C % G == 0:
print "use group size:", G
print("use group size:", G)
break
assert C % G == 0
param_shape = (G, )
......@@ -139,7 +142,7 @@ def seq_conv(input, channel, stride, filter, dilation=1, act=None):
filter,
stride,
groups=input.shape[1],
padding=(filter / 2) * dilation,
padding=(filter // 2) * dilation,
dilation=dilation)
input = bn(input)
if act: input = act(input)
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import cv2
import numpy as np
import os
import six
default_config = {
"shuffle": True,
......@@ -30,7 +35,7 @@ def slice_with_pad(a, s, value=0):
pr = 0
pads.append([pl, pr])
slices.append([l, r])
slices = map(lambda x: slice(x[0], x[1], 1), slices)
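# Under Python 3, map() returns a lazy iterator, so materialize the slices before using them to index the array.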
slices = list(map(lambda x: slice(x[0], x[1], 1), slices))
a = a[slices]
a = np.pad(a, pad_width=pads, mode='constant', constant_values=value)
return a
......@@ -38,11 +43,17 @@ def slice_with_pad(a, s, value=0):
class CityscapeDataset:
def __init__(self, dataset_dir, subset='train', config=default_config):
import commands
label_dirname = dataset_dir + 'gtFine/' + subset
label_files = commands.getoutput(
"find %s -type f | grep labelTrainIds | sort" %
label_dirname).splitlines()
label_dirname = os.path.join(dataset_dir, 'gtFine/' + subset)
if six.PY2:
import commands
label_files = commands.getoutput(
"find %s -type f | grep labelTrainIds | sort" %
label_dirname).splitlines()
else:
import subprocess
label_files = subprocess.getstatusoutput(
"find %s -type f | grep labelTrainIds | sort" %
label_dirname)[-1].splitlines()
self.label_files = label_files
self.label_dirname = label_dirname
self.index = 0
......@@ -50,7 +61,7 @@ class CityscapeDataset:
self.dataset_dir = dataset_dir
self.config = config
self.reset()
print "total number", len(label_files)
print("total number", len(label_files))
def reset(self, shuffle=False):
self.index = 0
......@@ -66,13 +77,14 @@ class CityscapeDataset:
shape = self.config["crop_size"]
while True:
ln = self.label_files[self.index]
img_name = self.dataset_dir + 'leftImg8bit/' + self.subset + ln[len(
self.label_dirname):]
img_name = os.path.join(
self.dataset_dir,
'leftImg8bit/' + self.subset + ln[len(self.label_dirname):])
img_name = img_name.replace('gtFine_labelTrainIds', 'leftImg8bit')
label = cv2.imread(ln)
img = cv2.imread(img_name)
if img is None:
print "load img failed:", img_name
print("load img failed:", img_name)
self.next_img()
else:
break
......@@ -128,5 +140,7 @@ class CityscapeDataset:
from prefetch_generator import BackgroundGenerator
batches = BackgroundGenerator(batches, 100)
except:
print "You can install 'prefetch_generator' for acceleration of data reading."
print(
"You can install 'prefetch_generator' for acceleration of data reading."
)
return batches
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = '0.98'
......@@ -126,13 +129,12 @@ exe = fluid.Executor(place)
exe.run(sp)
if args.init_weights_path:
print "load from:", args.init_weights_path
print("load from:", args.init_weights_path)
load_model()
dataset = CityscapeDataset(args.dataset_path, 'train')
if args.parallel:
print "Using ParallelExecutor."
exe_p = fluid.ParallelExecutor(
use_cuda=True, loss_name=loss_mean.name, main_program=tp)
......@@ -149,9 +151,9 @@ for i, imgs, labels, names in batches:
'label': labels},
fetch_list=[pred, loss_mean])
if i % 100 == 0:
print "Model is saved to", args.save_weights_path
print("Model is saved to", args.save_weights_path)
save_model()
print "step %s, loss: %s" % (i, np.mean(retv[1]))
print("step %s, loss: %s" % (i, np.mean(retv[1])))
print "Training done. Model is saved to", args.save_weights_path
print("Training done. Model is saved to", args.save_weights_path)
save_model()
......@@ -10,3 +10,4 @@ output*
pred
eval_tools
box*
PyramidBox_WiderFace*
......@@ -427,6 +427,7 @@ class PyramidBox(object):
overlap_threshold=0.35,
neg_overlap=0.35)
loss = fluid.layers.reduce_sum(loss)
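# mark the loss persistable so memory optimization will not reuse its buffer before it is fetched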
loss.persistable = True
return loss
def train(self):
......
......@@ -189,13 +189,13 @@ def train(args, config, train_params, train_file_list):
fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
if batch_id % 10 == 0:
if not args.use_pyramidbox:
print("Pass {0}, batch {1}, loss {2}, time {3}".format(
print("Pass {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
pass_id, batch_id, fetch_vars[0],
start_time - prev_start_time))
else:
print("Pass {0}, batch {1}, face loss {2}, " \
"head loss {3}, " \
"time {4}".format(pass_id,
print("Pass {:d}, batch {:d}, face loss {:.6f}, " \
"head loss {:.6f}, " \
"time {:.5f}".format(pass_id,
batch_id, fetch_vars[0], fetch_vars[1],
start_time - prev_start_time))
if pass_id % 1 == 0 or pass_id == epoc_num - 1:
......
......@@ -82,9 +82,6 @@ def save_widerface_bboxes(image_path, bboxes_scores, output_dir):
image_name = image_path.split('/')[-1]
image_class = image_path.split('/')[-2]
image_name = image_name.encode('utf-8')
image_class = image_class.encode('utf-8')
odir = os.path.join(output_dir, image_class)
if not os.path.exists(odir):
os.makedirs(odir)
......
# Faster RCNN Object Detection
---
## Table of Contents
- [Installation](#installation)
- [Introduction](#introduction)
- [Data preparation](#data-preparation)
- [Training](#training)
- [Finetuning](#finetuning)
- [Evaluation](#evaluation)
- [Inference and Visualization](#inference-and-visualization)
- [Appendix](#appendix)
## Installation
Running the sample code in this directory requires PaddlePaddle Fluid v1.0.0 or later. If the PaddlePaddle on your device is lower than this version, please follow the instructions in the [installation document](http://www.paddlepaddle.org/documentation/docs/zh/0.15.0/beginners_guide/install/install_doc.html#paddlepaddle) to update it.
## Introduction
[Faster RCNN](https://arxiv.org/abs/1506.01497) is a typical two-stage detector. The overall network can be divided into four parts, as shown below:
<p align="center">
<img src="image/Faster_RCNN.jpg" height=400 width=400 hspace='10'/> <br />
Faster RCNN model
</p>
1. Base conv layer. As a CNN object detector, Faster RCNN first extracts feature maps using a base convolutional network; the feature maps are then shared by the RPN and the fc layers. This sample uses [ResNet-50](https://arxiv.org/abs/1512.03385) as the base conv layer.
2. Region Proposal Network (RPN). The RPN generates proposals for detection. It generates anchors from a set of sizes and ratios, classifies the anchors into foreground and background by softmax, and then refines them by box regression to obtain more precise proposals.
3. RoI pooling. This layer takes feature maps and proposals as input, maps the proposals onto the feature maps, and pools them to the same size. The outputs are sent to fc layers for classification and regression.
4. Detection layer. The output of RoI pooling is used to compute the class and location of each proposal in two fc layers.
## Data preparation
Train the model on the [MS-COCO dataset](http://cocodataset.org/#download); download the dataset as follows:
cd dataset/coco
./download.sh
## Training
After data preparation, one can start the training step by:
python train.py \
--max_size=1333 \
--scales=800 \
--batch_size=8 \
--model_save_dir=output/
- Set ```export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7``` to specify 8 GPUs for training.
- For more help on arguments:
python train.py --help
**download the pre-trained model:** This sample provides a ResNet-50 pre-trained model converted from Caffe, with the parameters of the batch normalization layers fused. Download the pre-trained model as follows:
sh ./pretrained/download.sh
Set `pretrained_model` to load pre-trained model. In addition, this parameter is used to load trained model when finetuning as well.
**data reader introduction:**
* Data reader is defined in `reader.py`.
* The short side of each image is scaled to `scales`; if the long side then exceeds `max_size`, the long side is scaled down to `max_size` (see the sketch after this list).
* In the training stage, images are horizontally flipped.
* Images in the same batch can be padded to the same size.
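The resizing rule amounts to the following sketch (the function name and defaults here are illustrative, not the reader's actual API):
```
import cv2

def rescale_image(img, target_size=800, max_size=1333):
    # scale the short side to target_size, then cap the long side at max_size
    h, w = img.shape[:2]
    scale = float(target_size) / min(h, w)
    if scale * max(h, w) > max_size:
        scale = float(max_size) / max(h, w)
    return cv2.resize(img, None, fx=scale, fy=scale), scale
```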
**model configuration:**
* Use RoIPooling.
* NMS threshold=0.7. During training, pre\_nms=12000, post\_nms=2000; during test, pre\_nms=6000, post\_nms=1000.
* In generating proposal labels, fg\_fraction=0.25, fg\_thresh=0.5, bg\_thresh\_hi=0.5, bg\_thresh\_lo=0.0.
* In rpn target assignment, rpn\_fg\_fraction=0.5, rpn\_positive\_overlap=0.7, rpn\_negative\_overlap=0.3.
**training strategy:**
* Use momentum optimizer with momentum=0.9.
* Weight decay is 0.0001.
* In the first 500 iterations, the learning rate increases linearly from 0.00333 to 0.01; it is then decayed at iterations 120000 and 160000 with multipliers 0.1 and 0.01. The maximum number of iterations is 180000 (see the sketch after this list).
* In non-base convolutional layers, the learning rate of biases is set to twice the global learning rate.
* In the base convolutional layers, the parameters of the affine layers and the res body are not updated.
* Using 8 Nvidia Tesla V100 GPUs, the total training time is about 40 hours.
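The schedule described above amounts to the following pure-Python sketch; the training code itself builds it with `exponential_with_warmup_decay` from `learning_rate.py`, and the constants below simply mirror the description:
```
def lr_at(step, base_lr=0.01, warmup_iter=500, warmup_factor=1.0 / 3.0,
          boundaries=(120000, 160000), multipliers=(1.0, 0.1, 0.01)):
    if step < warmup_iter:
        # linear warmup from base_lr * warmup_factor (~0.00333) to base_lr (0.01)
        alpha = step / float(warmup_iter)
        return base_lr * (warmup_factor * (1 - alpha) + alpha)
    for boundary, multiplier in zip(boundaries, multipliers):
        if step < boundary:
            return base_lr * multiplier
    return base_lr * multipliers[-1]
```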
The training result is shown below:
<p align="center">
<img src="image/train_loss.jpg" height=500 width=650 hspace='10'/> <br />
Faster RCNN train loss
</p>
* Fluid all padding: each image is padded to 1333\*1333.
* Fluid minibatch padding: images in one batch are padded to the same size, the same as Detectron.
* Fluid no padding: images are not padded.
## Finetuning
Finetuning adapts model weights to a specific task starting from pretrained weights. After setting ```pretrained_model```, one can finetune a model as:
python train.py
--max_size=1333 \
--scales=800 \
--pretrained_model=${path_to_pretrain_model} \
--batch_size=8 \
--model_save_dir=output/
## Evaluation
Evaluation measures the performance of a trained model. This sample provides `eval_coco_map.py`, which uses a COCO-specific mAP metric defined by the [COCO committee](http://cocodataset.org/#detections-eval). To use `eval_coco_map.py`, [cocoapi](https://github.com/cocodataset/cocoapi) is needed. Install the cocoapi:
# COCOAPI=/path/to/clone/cocoapi
git clone https://github.com/cocodataset/cocoapi.git $COCOAPI
cd $COCOAPI/PythonAPI
# if cython is not installed
pip install Cython
# Install into global site-packages
make install
# Alternatively, if you do not have permissions or prefer
# not to install the COCO API into global site-packages
python2 setup.py install --user
`eval_coco_map.py` is the main executor for evaluation; one can start the evaluation step by:
python eval_coco_map.py \
--dataset=coco2017 \
--pretrained_model=${path_to_pretrain_model} \
--batch_size=1 \
--nms_threshold=0.5 \
--score_threshold=0.05
The evaluation result is shown below:
<p align="center">
<img src="image/mAP.jpg" height=500 width=650 hspace='10'/> <br />
Faster RCNN mAP
</p>
| Model | Batch size | Max iteration | mAP |
| :------------------------------ | :------------: | :-------------------:|------: |
| Detectron | 8 | 180000 | 0.315 |
| Fluid minibatch padding | 8 | 180000 | 0.314 |
| Fluid all padding | 8 | 180000 | 0.308 |
| Fluid no padding | 6 | 240000 | 0.317 |
* Fluid all padding: each image is padded to 1333\*1333.
* Fluid minibatch padding: images in one batch are padded to the same size, the same as Detectron.
* Fluid no padding: images are not padded.
## Inference and Visualization
Inference is used to get prediction scores or image features from trained models. `infer.py` is the main executor for inference; one can start the inference step by:
python infer.py \
--dataset=coco2017 \
--pretrained_model=${path_to_pretrain_model} \
--image_path=data/COCO17/val2017/ \
--image_name=000000000139.jpg \
--draw_threshold=0.6
The visualized inference results are shown below:
<p align="center">
<img src="image/000000000139.jpg" height=300 width=400 hspace='10'/>
<img src="image/000000127517.jpg" height=300 width=400 hspace='10'/>
<img src="image/000000203864.jpg" height=300 width=400 hspace='10'/>
<img src="image/000000515077.jpg" height=300 width=400 hspace='10'/> <br />
Faster RCNN Visualization Examples
</p>
# Faster RCNN Object Detection
---
## Table of Contents
- [Installation](#installation)
- [Introduction](#introduction)
- [Data preparation](#data-preparation)
- [Training](#training)
- [Finetuning](#finetuning)
- [Evaluation](#evaluation)
- [Inference and visualization](#inference-and-visualization)
- [Appendix](#appendix)
## Installation
Running the sample code in this directory requires PaddlePaddle Fluid v1.0.0 or later. If the PaddlePaddle in your environment is lower than this version, please update it following the instructions in the [installation document](http://www.paddlepaddle.org/documentation/docs/zh/0.15.0/beginners_guide/install/install_doc.html#paddlepaddle).
## Introduction
[Faster RCNN](https://arxiv.org/abs/1506.01497) is a typical two-stage object detector. As shown in the figure below, the overall network consists of four main parts:
<p align="center">
<img src="image/Faster_RCNN.jpg" height=400 width=400 hspace='10'/> <br />
Faster RCNN object detection model
</p>
1. Base convolutional layer. As a CNN-based object detection method, Faster RCNN first extracts the image's feature maps with a set of base convolutional networks; the feature maps are shared by the subsequent RPN layer and fully connected layers. This example uses [ResNet-50](https://arxiv.org/abs/1512.03385) as the base convolutional layer.
2. Region Proposal Network (RPN). The RPN generates candidate regions (proposals). It obtains a set of anchors from fixed sizes and ratios, classifies each anchor as foreground or background with softmax, and then refines the anchors by box regression to obtain precise proposals.
3. RoI pooling. This layer takes the feature maps and proposals, maps the proposals onto the feature maps, pools them into region feature maps of a uniform size, and feeds them to fully connected layers for classification.
4. Detection layer. The region feature maps are used to compute the class of each proposal, and box regression is applied once more to obtain the final precise position of the detection box.
## Data preparation
Training is performed on the [MS-COCO dataset](http://cocodataset.org/#download); download the dataset as follows:
cd dataset/coco
./download.sh
## Training
After data preparation, start training as follows:
python train.py \
--max_size=1333 \
--scales=800 \
--batch_size=8 \
--model_save_dir=output/ \
--pretrained_model=${path_to_pretrain_model}
- Set `export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7` to train on 8 GPUs.
- For more arguments, run:
python train.py --help
**Download the pre-trained model:** This example provides a ResNet-50 pre-trained model converted from Caffe, with the parameters of the batch normalization layers fused. Download the pre-trained model with:
sh ./pretrained/download.sh
Load the pre-trained model by setting `pretrained_model`. The same setting is also used to load a trained model when finetuning.
**Data reader:** The data reader is defined in `reader.py`. The short side of each image is scaled proportionally to `scales`; if the long side then exceeds `max_size`, the long side is scaled down to `max_size`. In the training stage, images are horizontally flipped. Padding the images within one batch to the same size is supported.
**Model configuration:**
* Use RoIPooling.
* During training, pre\_nms=12000 and post\_nms=2000; during testing, pre\_nms=6000 and post\_nms=1000. The NMS threshold is 0.7.
* In generating proposal labels, fg\_fraction=0.25, fg\_thresh=0.5, bg\_thresh\_hi=0.5, bg\_thresh\_lo=0.0.
* In RPN anchor assignment, rpn\_fg\_fraction=0.5, rpn\_positive\_overlap=0.7, rpn\_negative\_overlap=0.3.
The training results are shown below:
<p align="center">
<img src="image/train_loss.jpg" height=500 width=650 hspace='10'/> <br />
Faster RCNN training loss
</p>
* Fluid all padding: each image is padded to 1333\*1333.
* Fluid minibatch padding: images within one batch are padded to the same size, the same treatment as Detectron.
* Fluid no padding: images are not padded.
**Training strategy:**
* Train Faster RCNN with the momentum optimizer, momentum=0.9.
* Weight decay is 0.0001. In the first 500 iterations, the learning rate increases linearly from 0.00333 to 0.01; it is then decayed at iterations 120000 and 160000 with multipliers 0.1 and 0.01, and training runs for at most 180000 iterations.
* In non-base convolutional layers, the learning rate of convolution biases is twice the global learning rate.
* In the base convolutional layers, the parameters of the affine layers and the res2 layers are not updated.
* Using 8 Nvidia Tesla V100 GPUs in parallel, the total training time is about 40 hours.
## Evaluation
Evaluation measures the performance metrics of a trained model. This example uses the [official COCO evaluation](http://cocodataset.org/#detections-eval); [cocoapi](https://github.com/cocodataset/cocoapi) needs to be downloaded first.
# COCOAPI=/path/to/clone/cocoapi
git clone https://github.com/cocodataset/cocoapi.git $COCOAPI
cd $COCOAPI/PythonAPI
# if cython is not installed
pip install Cython
# Install into global site-packages
make install
# Alternatively, if you do not have permissions or prefer
# not to install the COCO API into global site-packages
python2 setup.py install --user
`eval_coco_map.py` is the main executor for evaluation; an example invocation:
python eval_coco_map.py \
--dataset=coco2017 \
--pretrained_model=${path_to_pretrain_model} \
--batch_size=1 \
--nms_threshold=0.5 \
--score_threshold=0.05
The evaluation results are shown below:
<p align="center">
<img src="image/mAP.jpg" height=500 width=650 hspace='10'/> <br />
Faster RCNN mAP
</p>
| Model | Batch size | Max iteration | mAP |
| :------------------------------ | :------------: | :------------------: |------: |
| Detectron | 8 | 180000 | 0.315 |
| Fluid minibatch padding | 8 | 180000 | 0.314 |
| Fluid all padding | 8 | 180000 | 0.308 |
| Fluid no padding | 6 | 240000 | 0.317 |
* Fluid all padding: each image is padded to 1333\*1333.
* Fluid minibatch padding: images within one batch are padded to the same size, the same treatment as Detectron.
* Fluid no padding: images are not padded.
## Inference and visualization
Inference retrieves the objects in an image together with their classes. `infer.py` is the main executor; an example invocation:
python infer.py \
--dataset=coco2017 \
--pretrained_model=${path_to_pretrain_model} \
--image_path=data/COCO17/val2017/ \
--image_name=000000000139.jpg \
--draw_threshold=0.6
The visualized predictions are shown below:
<p align="center">
<img src="image/000000000139.jpg" height=300 width=400 hspace='10'/>
<img src="image/000000127517.jpg" height=300 width=400 hspace='10'/>
<img src="image/000000203864.jpg" height=300 width=400 hspace='10'/>
<img src="image/000000515077.jpg" height=300 width=400 hspace='10'/> <br />
Faster RCNN visualization examples
</p>
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import numpy as np
import argparse
import functools
from eval_helper import get_nmsed_box
from eval_helper import get_dt_res
import paddle
import paddle.fluid as fluid
import reader
from utility import print_arguments, parse_args
# A special mAP metric for COCO dataset, which averages AP in different IoUs.
# To use this eval_coco_map.py, [cocoapi](https://github.com/cocodataset/cocoapi) is needed.
import models.model_builder as model_builder
import models.resnet as resnet
import json
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import numpy as np
import paddle.fluid as fluid
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Constant
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Constant
......
DIR="$( cd "$(dirname "$0")" ; pwd -P )"
cd "$DIR"
# Download the data.
echo "Downloading..."
wget http://paddlemodels.bj.bcebos.com/faster_rcnn/imagenet_resnet50_fusebn.tar.gz
echo "Extracting..."
tar -xf imagenet_resnet50_fusebn.tar.gz
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import time
import numpy as np
import argparse
import functools
import shutil
import cPickle
from utility import add_arguments, print_arguments
from utility import parse_args, add_arguments, print_arguments
import paddle
import paddle.fluid as fluid
......@@ -16,50 +27,12 @@ import models.model_builder as model_builder
import models.resnet as resnet
from learning_rate import exponential_with_warmup_decay
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
# ENV
add_arg('parallel', bool, True, "Whether to run with ParallelExecutor.")
add_arg('use_gpu', bool, True, "Whether use GPU.")
add_arg('model_save_dir', str, 'model', "The path to save model.")
add_arg('pretrained_model', str, 'imagenet_resnet50_fusebn', "The init model path.")
add_arg('dataset', str, 'coco2017', "coco2014, coco2017, and pascalvoc.")
add_arg('data_dir', str, 'data/COCO17', "data directory")
add_arg('skip_reader', bool, False, "Whether to skip data reader.")
add_arg('use_profile', bool, False, "Whether to use profiler tool.")
add_arg('class_num', int, 81, "Class number.")
add_arg('use_pyreader', bool, False, "Whether to use py_reader to feed data.")
# SOLVER
add_arg('learning_rate', float, 0.01, "Learning rate.")
add_arg('num_iteration', int, 10, "Epoch number.")
# RPN
add_arg('anchor_sizes', int, [32,64,128,256,512], "The size of anchors.")
add_arg('aspect_ratios', float, [0.5,1.0,2.0], "The ratio of anchors.")
add_arg('variance', float, [1.,1.,1.,1.], "The variance of anchors.")
add_arg('rpn_stride', float, 16., "Stride of the feature map that RPN is attached.")
# FAST RCNN
# TRAIN TEST
add_arg('batch_size', int, 1, "Minibatch size.")
add_arg('max_size', int, 1333, "The max resized image size.")
add_arg('scales', int, [800], "The resized image height.")
add_arg('batch_size_per_im',int, 512, "fast rcnn head batch size")
add_arg('mean_value', float, [102.9801, 115.9465, 122.7717], "pixel mean")
add_arg('debug', bool, False, "Debug mode")
#yapf: enable
def train(cfg):
batch_size = cfg.batch_size
learning_rate = cfg.learning_rate
image_shape = [3, cfg.max_size, cfg.max_size]
num_iterations = cfg.num_iteration
if cfg.debug:
fluid.default_startup_program().random_seed = 1000
fluid.default_main_program().random_seed = 1000
import random
random.seed(0)
np.random.seed(0)
num_iterations = cfg.max_iter
devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
devices_num = len(devices.split(","))
......@@ -72,21 +45,22 @@ def train(cfg):
use_random=False)
model.build_model(image_shape)
loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss = model.loss()
loss_cls.persistable=True
loss_bbox.persistable=True
rpn_cls_loss.persistable=True
rpn_reg_loss.persistable=True
loss_cls.persistable = True
loss_bbox.persistable = True
rpn_cls_loss.persistable = True
rpn_reg_loss.persistable = True
loss = loss_cls + loss_bbox + rpn_cls_loss + rpn_reg_loss
boundaries = [120000, 160000]
values = [learning_rate, learning_rate*0.1, learning_rate*0.01]
values = [learning_rate, learning_rate * 0.1, learning_rate * 0.01]
optimizer = fluid.optimizer.Momentum(
learning_rate=exponential_with_warmup_decay(learning_rate=learning_rate,
learning_rate=exponential_with_warmup_decay(
learning_rate=learning_rate,
boundaries=boundaries,
values=values,
warmup_iter=500,
warmup_factor=1.0/3.0),
warmup_factor=1.0 / 3.0),
regularization=fluid.regularizer.L2Decay(0.0001),
momentum=0.9)
optimizer.minimize(loss)
......@@ -98,22 +72,33 @@ def train(cfg):
exe.run(fluid.default_startup_program())
if cfg.pretrained_model:
def if_exist(var):
return os.path.exists(os.path.join(cfg.pretrained_model, var.name))
fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)
if cfg.parallel:
train_exe = fluid.ParallelExecutor(
use_cuda=bool(cfg.use_gpu), loss_name=loss.name)
assert cfg.batch_size % devices_num == 0, \
"batch_size = %d, devices_num = %d" %(cfg.batch_size, devices_num)
batch_size_per_dev = cfg.batch_size / devices_num
if cfg.use_pyreader:
train_reader = reader.train(cfg, batch_size=1, shuffle=not cfg.debug)
train_reader = reader.train(
cfg,
batch_size=batch_size_per_dev,
total_batch_size=cfg.batch_size,
padding_total=cfg.padding_minibatch,
shuffle=False)
py_reader = model.py_reader
py_reader.decorate_paddle_reader(train_reader)
else:
train_reader = reader.train(cfg, batch_size=cfg.batch_size, shuffle=not cfg.debug)
feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
train_reader = reader.train(
cfg, batch_size=cfg.batch_size, shuffle=False)
feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
fetch_list = [loss, loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss]
......@@ -124,22 +109,27 @@ def train(cfg):
for batch_id in range(iterations):
start_time = time.time()
data = train_reader().next()
data = next(train_reader())
end_time = time.time()
reader_time.append(end_time - start_time)
start_time = time.time()
losses = train_exe.run(fetch_list=[v.name for v in fetch_list],
feed=feeder.feed(data))
if cfg.parallel:
losses = train_exe.run(fetch_list=[v.name for v in fetch_list],
feed=feeder.feed(data))
else:
losses = exe.run(fluid.default_main_program(),
fetch_list=[v.name for v in fetch_list],
feed=feeder.feed(data))
end_time = time.time()
run_time.append(end_time - start_time)
total_images += data[0][0].shape[0]
total_images += len(data)
lr = np.array(fluid.global_scope().find_var('learning_rate').get_tensor())
print("Batch {:d}, lr {:.6f}, loss {:.6f} ".format(
batch_id, lr[0], losses[0][0]))
lr = np.array(fluid.global_scope().find_var('learning_rate')
.get_tensor())
print("Batch {:d}, lr {:.6f}, loss {:.6f} ".format(batch_id, lr[0],
losses[0][0]))
return reader_time, run_time, total_images
def run_pyreader(iterations):
reader_time = [0]
run_time = []
......@@ -149,13 +139,19 @@ def train(cfg):
try:
for batch_id in range(iterations):
start_time = time.time()
losses = train_exe.run(fetch_list=[v.name for v in fetch_list])
if cfg.parallel:
losses = train_exe.run(
fetch_list=[v.name for v in fetch_list])
else:
losses = exe.run(fluid.default_main_program(),
fetch_list=[v.name for v in fetch_list])
end_time = time.time()
run_time.append(end_time - start_time)
total_images += devices_num
lr = np.array(fluid.global_scope().find_var('learning_rate').get_tensor())
print("Batch {:d}, lr {:.6f}, loss {:.6f} ".format(
batch_id, lr[0], losses[0][0]))
lr = np.array(fluid.global_scope().find_var('learning_rate')
.get_tensor())
print("Batch {:d}, lr {:.6f}, loss {:.6f} ".format(batch_id, lr[
0], losses[0][0]))
except fluid.core.EOFException:
py_reader.reset()
......@@ -167,20 +163,23 @@ def train(cfg):
run_func(2)
# profiling
start = time.time()
if cfg.use_profile:
use_profile = False
if use_profile:
with profiler.profiler('GPU', 'total', '/tmp/profile_file'):
reader_time, run_time, total_images = run(num_iterations)
reader_time, run_time, total_images = run_func(num_iterations)
else:
reader_time, run_time, total_images = run_func(num_iterations)
end = time.time()
total_time = end - start
print("Total time: {0}, reader time: {1} s, run time: {2} s, images/s: {3}".format(
total_time, np.sum(reader_time), np.sum(run_time), total_images / total_time))
print("Total time: {0}, reader time: {1} s, run time: {2} s, images/s: {3}".
format(total_time,
np.sum(reader_time),
np.sum(run_time), total_images / total_time))
if __name__ == '__main__':
args = parser.parse_args()
args = parse_args()
print_arguments(args)
data_args = reader.Settings(args)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -150,6 +150,8 @@ def coco(settings,
else:
for roidb in roidbs:
if settings.image_name not in roidb['image']:
continue
im, im_info, im_id = roidb_reader(roidb, mode)
batch_out = [(im, im_info, im_id)]
yield batch_out
......
......@@ -26,7 +26,6 @@ from __future__ import print_function
from __future__ import unicode_literals
import copy
import cPickle as pickle
import logging
import numpy as np
import os
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import sys
import numpy as np
import argparse
import functools
import time
import shutil
import cPickle
from utility import parse_args, print_arguments, SmoothedValue
import paddle
......@@ -117,7 +132,7 @@ def train(cfg):
iter_id, lr[0],
smoothed_loss.get_median_value(
), start_time - prev_start_time))
#print('cls_loss ', losses[1][0], ' reg_loss ', losses[2][0], ' loss_cls ', losses[3][0], ' loss_bbox ', losses[4][0])
sys.stdout.flush()
if (iter_id + 1) % cfg.snapshot_stride == 0:
save_model("model_iter{}".format(iter_id))
except fluid.core.EOFException:
......@@ -143,7 +158,7 @@ def train(cfg):
print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
iter_id, lr[0],
smoothed_loss.get_median_value(), start_time - prev_start_time))
#print('cls_loss ', losses[1][0], ' reg_loss ', losses[2][0], ' loss_cls ', losses[3][0], ' loss_bbox ', losses[4][0])
sys.stdout.flush()
if (iter_id + 1) % cfg.snapshot_stride == 0:
save_model("model_iter{}".format(iter_id))
if (iter_id + 1) == cfg.max_iter:
......
"""Contains common utility functions."""
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
......@@ -12,6 +11,9 @@
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
"""
Contains common utility functions.
"""
from __future__ import absolute_import
from __future__ import division
......@@ -83,8 +85,7 @@ class SmoothedValue(object):
def parse_args():
"""
return all args
"""return all args
"""
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......
......@@ -12,8 +12,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
import six
import argparse
import functools
import matplotlib
......@@ -40,7 +44,9 @@ add_arg('use_gpu', bool, True, "Whether to use GPU to train.")
def loss(x, label):
return fluid.layers.mean(x * (label - 0.5))
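# sigmoid_cross_entropy_with_logits applies the sigmoid internally, giving the standard, numerically stable binary cross-entropy GAN loss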
return fluid.layers.mean(
fluid.layers.sigmoid_cross_entropy_with_logits(
x=x, label=label))
def train(args):
......@@ -67,7 +73,10 @@ def train(args):
g_program_test = dg_program.clone(for_test=True)
dg_logit = D_cond(g_img, conditions)
dg_loss = loss(dg_logit, 1)
dg_loss = loss(
dg_logit,
fluid.layers.fill_constant_batch_size_like(
input=noise, dtype='float32', shape=[-1, 1], value=1.0))
opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE)
......@@ -97,7 +106,7 @@ def train(args):
noise_data = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
real_image = np.array(map(lambda x: x[0], data)).reshape(
real_image = np.array(list(map(lambda x: x[0], data))).reshape(
-1, 784).astype('float32')
conditions_data = np.array([x[1] for x in data]).reshape(
[-1, 1]).astype("float32")
......@@ -133,7 +142,7 @@ def train(args):
d_loss_np = [d_loss_1[0][0], d_loss_2[0][0]]
for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
for _ in six.moves.xrange(NUM_TRAIN_TIMES_OF_DG):
noise_data = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
......@@ -154,7 +163,7 @@ def train(args):
total_images = np.concatenate([real_image, generated_images])
fig = plot(total_images)
msg = "Epoch ID={0}\n Batch ID={1}\n D-Loss={2}\n DG-Loss={3}\n gen={4}".format(
pass_id, batch_id, d_loss_np, dg_loss_np,
pass_id, batch_id, np.mean(d_loss_np), dg_loss_np,
check(generated_images))
print(msg)
plt.title(msg)
......
......@@ -12,11 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import sys
import os
import argparse
import functools
import matplotlib
import six
import numpy as np
import paddle
import paddle.fluid as fluid
......@@ -32,15 +36,17 @@ LEARNING_RATE = 2e-4
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 121, "Minibatch size.")
add_arg('batch_size', int, 128, "Minibatch size.")
add_arg('epoch', int, 20, "The number of epoched to be trained.")
add_arg('output', str, "./output", "The directory the model and the test result to be saved to.")
add_arg('output', str, "./output_dcgan", "The directory the model and the test result to be saved to.")
add_arg('use_gpu', bool, True, "Whether to use GPU to train.")
# yapf: enable
def loss(x, label):
return fluid.layers.mean(x * (label - 0.5))
return fluid.layers.mean(
fluid.layers.sigmoid_cross_entropy_with_logits(
x=x, label=label))
def train(args):
......@@ -63,7 +69,10 @@ def train(args):
g_program_test = dg_program.clone(for_test=True)
dg_logit = D(g_img)
dg_loss = loss(dg_logit, 1)
dg_loss = loss(
dg_logit,
fluid.layers.fill_constant_batch_size_like(
input=noise, dtype='float32', shape=[-1, 1], value=1.0))
opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE)
......@@ -93,7 +102,7 @@ def train(args):
noise_data = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
real_image = np.array(map(lambda x: x[0], data)).reshape(
real_image = np.array(list(map(lambda x: x[0], data))).reshape(
-1, 784).astype('float32')
real_labels = np.ones(
shape=[real_image.shape[0], 1], dtype='float32')
......@@ -123,7 +132,7 @@ def train(args):
d_loss_np = [d_loss_1[0][0], d_loss_2[0][0]]
for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
for _ in six.moves.xrange(NUM_TRAIN_TIMES_OF_DG):
noise_data = np.random.uniform(
low=-1.0, high=1.0,
size=[args.batch_size, NOISE_SIZE]).astype('float32')
......@@ -139,9 +148,9 @@ def train(args):
fetch_list={g_img})[0]
total_images = np.concatenate([real_image, generated_images])
fig = plot(total_images)
msg = "Epoch ID={0}\n Batch ID={1}\n D-Loss={2}\n DG-Loss={3}\n gen={4}".format(
pass_id, batch_id, d_loss_np, dg_loss_np,
check(generated_images))
msg = "Epoch ID={0} Batch ID={1} D-Loss={2} DG-Loss={3}\n gen={4}".format(
pass_id, batch_id,
np.mean(d_loss_np), dg_loss_np, check(generated_images))
print(msg)
plt.title(msg)
plt.savefig(
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from utility import get_parent_function_name
......@@ -104,13 +107,13 @@ def D_cond(image, y):
def G_cond(z, y):
s_h, s_w = output_height, output_width
s_h2, s_h4 = int(s_h / 2), int(s_h / 4)
s_w2, s_w4 = int(s_w / 2), int(s_w / 4)
s_h2, s_h4 = int(s_h // 2), int(s_h // 4)
s_w2, s_w4 = int(s_w // 2), int(s_w // 4)
yb = fluid.layers.reshape(y, [-1, y_dim, 1, 1]) #NCHW
z = fluid.layers.concat([z, y], 1)
h0 = bn(fc(z, gfc_dim / 2), act='relu')
h0 = bn(fc(z, gfc_dim // 2), act='relu')
h0 = fluid.layers.concat([h0, y], 1)
h1 = bn(fc(h0, gf_dim * 2 * s_h4 * s_w4), act='relu')
......@@ -134,8 +137,8 @@ def D(x):
def G(x):
x = bn(fc(x, gfc_dim))
x = bn(fc(x, gf_dim * 2 * img_dim / 4 * img_dim / 4))
x = fluid.layers.reshape(x, [-1, gf_dim * 2, img_dim / 4, img_dim / 4])
x = bn(fc(x, gf_dim * 2 * img_dim // 4 * img_dim // 4))
x = fluid.layers.reshape(x, [-1, gf_dim * 2, img_dim // 4, img_dim // 4])
x = deconv(x, gf_dim * 2, act='relu', output_size=[14, 14])
x = deconv(x, 1, filter_size=5, padding=2, act='tanh', output_size=[28, 28])
x = fluid.layers.reshape(x, shape=[-1, 28 * 28])
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import distutils.util
import numpy as np
import inspect
import matplotlib
import six
matplotlib.use('agg')
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
......@@ -54,7 +58,7 @@ def print_arguments(args):
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
......
......@@ -21,21 +21,23 @@ TODO
The horse2zebra training set contains 1069 horse images and 1336 zebra images; the test set contains 121 horse images and 141 zebra images.
After the data is downloaded and processed, it is organized under the following paths:
After the data is downloaded and processed, organize it into the following directory structure:
```
horse2zebra/
|-- testA
|-- testA.txt
|-- testB
|-- testB.txt
|-- trainA
|-- trainA.txt
|-- trainB
`-- trainB.txt
data
|-- horse2zebra
| |-- testA
| |-- testA.txt
| |-- testB
| |-- testB.txt
| |-- trainA
| |-- trainA.txt
| |-- trainB
| `-- trainB.txt
```
In the files above, 'testA' is the folder of horse test images, 'testB' is the folder of zebra test images, and 'testA.txt' and 'testB.txt' are the path list files of the horse and zebra test images respectively, in the following format:
In the layout above, the `data` folder must be placed in the same directory as the training script `train.py`. `testA` is the folder of horse test images, `testB` is the folder of zebra test images, and `testA.txt` and `testB.txt` are the path list files of the horse and zebra test images respectively, in the following format:
```
testA/n02381460_9243.jpg
......@@ -53,7 +55,7 @@ testA/n02381460_9245.jpg
Train on a single GPU:
```
env CUDA_VISIABLE_DEVICES=0 python train.py
env CUDA_VISIBLE_DEVICES=0 python train.py
```
Run `python train.py --help` for more usage information and detailed argument descriptions.
......@@ -72,7 +74,7 @@ env CUDA_VISIABLE_DEVICES=0 python train.py
```
env CUDA_VISIBLE_DEVICES=0 python infer.py \
--model_path="models/1" --input="./data/inputA/*" \
--init_model="models/1" --input="./data/inputA/*" \
--output="./output"
```
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from PIL import Image
import numpy as np
from itertools import izip
A_LIST_FILE = "./data/horse2zebra/trainA.txt"
B_LIST_FILE = "./data/horse2zebra/trainB.txt"
......@@ -70,11 +72,3 @@ def b_test_reader():
Reader of images with B style for test.
"""
return reader_creater(B_TEST_LIST_FILE, cycle=False, return_name=True)
if __name__ == "__main__":
for A, B in izip(a_test_reader()(), a_test_reader()()):
print A[0].shape
print A[1]
print B[0].shape
print B[1]
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import data_reader
import os
import random
......@@ -9,7 +12,6 @@ import paddle.fluid as fluid
import numpy as np
from paddle.fluid import core
from trainer import *
from itertools import izip
from scipy.misc import imsave
import paddle.fluid.profiler as profiler
from utility import add_arguments, print_arguments, ImagePool
......@@ -66,7 +68,7 @@ def train(args):
if not os.path.exists(out_path):
os.makedirs(out_path)
i = 0
for data_A, data_B in izip(A_test_reader(), B_test_reader()):
for data_A, data_B in zip(A_test_reader(), B_test_reader()):
A_name = data_A[1]
B_name = data_B[1]
tensor_A = core.LoDTensor()
......@@ -114,7 +116,7 @@ def train(args):
exe, out_path + "/d_a", main_program=d_A_trainer.program)
fluid.io.save_persistables(
exe, out_path + "/d_b", main_program=d_B_trainer.program)
print "saved checkpoint to [%s]" % out_path
print("saved checkpoint to {}".format(out_path))
sys.stdout.flush()
def init_model():
......@@ -128,7 +130,7 @@ def train(args):
exe, args.init_model + "/d_a", main_program=d_A_trainer.program)
fluid.io.load_persistables(
exe, args.init_model + "/d_b", main_program=d_B_trainer.program)
print "Load model from [%s]" % args.init_model
print("Load model from {}".format(args.init_model))
if args.init_model:
init_model()
......@@ -136,8 +138,8 @@ def train(args):
for epoch in range(args.epoch):
batch_id = 0
for i in range(max_images_num):
data_A = A_reader.next()
data_B = B_reader.next()
data_A = next(A_reader)
data_B = next(B_reader)
tensor_A = core.LoDTensor()
tensor_B = core.LoDTensor()
tensor_A.set(data_A, place)
......@@ -174,9 +176,9 @@ def train(args):
feed={"input_A": tensor_A,
"fake_pool_A": fake_pool_A})
print "epoch[%d]; batch[%d]; g_A_loss: %s; d_B_loss: %s; g_B_loss: %s; d_A_loss: %s;" % (
print("epoch{}; batch{}; g_A_loss: {}; d_B_loss: {}; g_B_loss: {}; d_A_loss: {};".format(
epoch, batch_id, g_A_loss[0], d_B_loss[0], g_B_loss[0],
d_A_loss[0])
d_A_loss[0]))
sys.stdout.flush()
batch_id += 1
......
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from model import *
import paddle.fluid as fluid
......
......@@ -17,6 +17,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import distutils.util
import six
import random
import glob
import numpy as np
......@@ -39,7 +40,7 @@ def print_arguments(args):
:type args: argparse.Namespace
"""
print("----------- Configuration Arguments -----------")
for arg, value in sorted(vars(args).iteritems()):
for arg, value in sorted(six.iteritems(vars(args))):
print("%s: %s" % (arg, value))
print("------------------------------------------------")
......
......@@ -8,7 +8,7 @@ import os
import cv2
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
from icnet import icnet
from utils import add_arguments, print_arguments, get_feeder_data
from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
......@@ -111,10 +111,10 @@ def infer(args):
for line in open(args.images_list):
image_file = args.images_path + "/" + line.strip()
filename = os.path.basename(image_file)
image = paddle.image.load_image(
image = paddle.dataset.image.load_image(
image_file, is_color=True).astype("float32")
image -= IMG_MEAN
img = paddle.image.to_chw(image)[np.newaxis, :]
img = paddle.dataset.image.to_chw(image)[np.newaxis, :]
image_t = fluid.core.LoDTensor()
image_t.set(img, place)
result = exe.run(inference_program,
......
......@@ -14,7 +14,7 @@
## Installation
Running the sample code in this directory requires PaddlePaddle Fluid v0.13.0 or later. If the PaddlePaddle in your environment is lower than this version, please update it following the instructions in the [installation document](http://www.paddlepaddle.org/docs/develop/documentation/zh/build_and_install/pip_install_cn.html).
Running the sample code in this directory requires PaddlePaddle Fluid v0.13.0 or later. If the PaddlePaddle in your environment is lower than this version, please update it following the instructions in the installation document.
## Data preparation
......
......@@ -8,7 +8,7 @@ import sys
import os
import numpy as np
import paddle.fluid as fluid
import paddle.v2 as paddle
import paddle
def test_model(exe, test_program, fetch_list, test_reader, feeder):
......
......@@ -52,7 +52,7 @@ In this example, we launched 4 parameter server instances and 4 trainer instance
1. launch trainer process
``` python
PADDLE_TRAINING_ROLE=PSERVER \
PADDLE_TRAINING_ROLE=TRAINER \
PADDLE_TRAINERS=4 \
PADDLE_PSERVER_IPS=192.168.0.100,192.168.0.101,192.168.0.102,192.168.0.103 \
PADDLE_TRAINER_ID=0 \
......@@ -110,4 +110,4 @@ Training acc1 curves
### Performance
TBD
\ No newline at end of file
TBD
......@@ -22,6 +22,7 @@ import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import six
import sys
sys.path.append("..")
import models
......@@ -172,7 +173,7 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog):
def test_parallel(exe, test_args, args, test_prog, feeder):
acc_evaluators = []
for i in xrange(len(test_args[2])):
for i in six.moves.xrange(len(test_args[2])):
acc_evaluators.append(fluid.metrics.Accuracy())
to_fetch = [v.name for v in test_args[2]]
......@@ -291,7 +292,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
for arg, value in sorted(six.iteritems(vars(args))):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
......@@ -307,7 +308,7 @@ def print_paddle_envs():
print('----------- Configuration envs -----------')
for k in os.environ:
if "PADDLE_" in k:
print "ENV %s:%s" % (k, os.environ[k])
print("ENV %s:%s" % (k, os.environ[k]))
print('------------------------------------------------')
......
......@@ -140,7 +140,7 @@ def _reader_creator(file_list,
# distributed mode if the env var `PADDLE_TRAINING_ROLE` exists
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
trainer_count = int(os.getenv("PADDLE_TRAINERS", "1"))
per_node_lines = len(full_lines) / trainer_count
per_node_lines = len(full_lines) // trainer_count
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1)
* per_node_lines]
print(
......
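The change to floor division matters under Python 3, where `/` yields a float and would break the slice; a self-contained sketch with made-up values:
```python
# Shard a 10-element file list evenly across 4 trainers (values hypothetical).
full_lines = ["sample_%d" % i for i in range(10)]
trainer_id, trainer_count = 1, 4
per_node_lines = len(full_lines) // trainer_count  # 2; plain `/` gives 2.5 on Py3
lines = full_lines[trainer_id * per_node_lines:(trainer_id + 1) * per_node_lines]
print(lines)  # ['sample_2', 'sample_3']
```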
......@@ -33,7 +33,7 @@ add_arg('lr', float, 0.1, "set learning rate.")
add_arg('lr_strategy', str, "piecewise_decay", "Set the learning rate decay strategy.")
add_arg('model', str, "SE_ResNeXt50_32x4d", "Set the network to use.")
add_arg('enable_ce', bool, False, "If set True, enable continuous evaluation job.")
add_arg('data_dir' str, "./data/ILSVRC2012", "The ImageNet dataset root dir.")
add_arg('data_dir', str, "./data/ILSVRC2012", "The ImageNet dataset root dir.")
# yapf: enable
model_list = [m for m in dir(models) if "__" not in m]
......
......@@ -4,7 +4,6 @@ import random
import cPickle
import functools
import numpy as np
#import paddle.v2 as paddle
import paddle
from PIL import Image, ImageEnhance
......
#!/bin/bash
DATA_PATH=$HOME/.cache/paddle/dataset/wmt16
if [ ! -d $DATA_PATH/en_10000.dict ] ; then
if [ ! -e $DATA_PATH/en_10000.dict ] ; then
python -c 'import paddle;paddle.dataset.wmt16.train(10000, 10000, "en")().next()'
tar -zxf $DATA_PATH/wmt16.tar.gz -C $DATA_PATH
fi
......
......@@ -63,7 +63,7 @@ The WMT dataset is the recognized mainstream dataset in machine translation; the WMT EN-DE and EN-FR
#### WMT English-German Translation Data
The [WMT'16 EN-DE dataset](http://www.statmt.org/wmt16/translation-task.html) is a medium-scale dataset. Following the paper, we use BPE-encoded data for EN-DE, which better handles the out-of-vocabulary (OOV) problem[4]. The BPE data can be downloaded from [here](https://github.com/google/seq2seq/blob/master/docs/data.md) (to apply BPE encoding to custom data, see the preprocessing steps [here](https://github.com/rsennrich/subword-nmt)). After downloading and extracting, `train.tok.clean.bpe.32000.en` and `train.tok.clean.bpe.32000.de` are the BPE training data (parallel corpora for English and German respectively, tokenized and BPE-processed), `newstest2013.tok.bpe.32000.en` and `newstest2013.tok.bpe.32000.de` etc. are the test data (`newstest2013.tok.en` and `newstest2013.tok.de` etc. are the corresponding test data without BPE), and `vocab.bpe.32000` is the vocabulary file (shared by the source and target languages).
The [WMT'16 EN-DE dataset](http://www.statmt.org/wmt16/translation-task.html) is a medium-scale dataset. Following the paper, we use BPE-encoded data for EN-DE, which better handles the out-of-vocabulary (OOV) problem[4]. The BPE data can be downloaded from [here](https://github.com/google/seq2seq/blob/master/docs/data.md) (to apply BPE encoding to custom data, see the preprocessing steps [here](https://github.com/rsennrich/subword-nmt)). After downloading and extracting, `train.tok.clean.bpe.32000.en` and `train.tok.clean.bpe.32000.de` are the BPE training data (parallel corpora for English and German respectively, tokenized and BPE-processed), `newstest2016.tok.bpe.32000.en` and `newstest2016.tok.bpe.32000.de` etc. are the test data (`newstest2016.tok.en` and `newstest2016.tok.de` etc. are the corresponding test data without BPE), and `vocab.bpe.32000` is the vocabulary file (shared by the source and target languages).
Since the data-reading script `reader.py` in this example expects samples formatted as `\t`-separated source and target sentence pairs (words within a sentence are space-separated by default), the source-to-target parallel corpus files need to be merged into a single file; the merge can be done with the following command:
```sh
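# The original merge command is collapsed by this diff view; a hedged sketch,
# assuming GNU coreutils `paste` joins the two corpora with a tab:
paste -d '\t' train.tok.clean.bpe.32000.en train.tok.clean.bpe.32000.de > train.tok.clean.bpe.32000.en-de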
......@@ -91,7 +91,7 @@ python -u train.py \
--train_file_pattern data/train.tok.clean.bpe.32000.en-de \
--token_delimiter ' ' \
--use_token_batch True \
--batch_size 3200 \
--batch_size 4096 \
--sort_type pool \
--pool_size 200000
```
......@@ -100,7 +100,7 @@ python -u train.py \
python train.py --help
```
More training-related parameters are defined in `ModelHyperParams` and `TrainTaskConfig` in `config.py`; `ModelHyperParams` defines model hyperparameters such as the embedding dimension, while `TrainTaskConfig` defines training parameters such as the number of warmup steps. These default to the base-model configuration in the Transformer paper and can be modified in that script if adjustment is needed. They can also be set on the command line when launching the training script; the passed-in settings are merged with and override those in `config.py`. For example, the big model from the Transformer paper can be trained with the following command:
More training-related parameters are defined in `ModelHyperParams` and `TrainTaskConfig` in `config.py`; `ModelHyperParams` defines model hyperparameters such as the embedding dimension, while `TrainTaskConfig` defines training parameters such as the number of warmup steps. These default to the base-model configuration in the Transformer paper and can be modified in that script if adjustment is needed. They can also be set on the command line when launching the training script; the passed-in settings are merged with and override those in `config.py`. For example, the big model from the Transformer paper can be trained with the following command (reduce the batch size appropriately if GPU memory is insufficient):
```sh
python -u train.py \
......@@ -117,22 +117,23 @@ python -u train.py \
n_head 16 \
d_model 1024 \
d_inner_hid 4096 \
dropout 0.3
n_head 16 \
prepostprocess_dropout 0.3
```
For more detailed information on these parameters, refer to the comments in `config.py`. For the EN-FR data, training is similar to EN-DE training: change the vocabulary and data files in the command to the paths of the corresponding EN-FR files. Also note that since tokens in the EN-FR data are not space-separated, the `token_delimiter` argument must be set to `--token_delimiter '\x01'`.
Training uses all GPUs by default; the GPUs used can be set via the `CUDA_VISIBLE_DEVICES` environment variable. Training can also run on CPU only (set via the `--device CPU` argument), though relatively slowly. During training, the model is saved to the directory given by `model_dir` at the end of each epoch, and also every 1000 iterations within an epoch; each iteration prints a log like the following to standard output:
Training uses all GPUs by default; the GPUs used can be set via the `CUDA_VISIBLE_DEVICES` environment variable. Training can also run on CPU only (set via the `--device CPU` argument), though relatively slowly. During training, the model is saved to the directory given by `model_dir` every fixed number of iterations (set via the `save_freq` argument, 10000 by default); a checkpoint is also saved to the directory given by `ckpt_dir` at the end of each epoch. Each iteration prints a log like the following to standard output:
```txt
epoch: 0, batch: 0, sum loss: 258793.343750, avg loss: 11.069005, ppl: 64151.644531
epoch: 0, batch: 1, sum loss: 256140.718750, avg loss: 11.059616, ppl: 63552.148438
epoch: 0, batch: 2, sum loss: 258931.093750, avg loss: 11.064013, ppl: 63832.167969
epoch: 0, batch: 3, sum loss: 256837.875000, avg loss: 11.058206, ppl: 63462.574219
epoch: 0, batch: 4, sum loss: 256461.000000, avg loss: 11.053401, ppl: 63158.390625
epoch: 0, batch: 5, sum loss: 257064.562500, avg loss: 11.019099, ppl: 61028.683594
epoch: 0, batch: 6, sum loss: 256180.125000, avg loss: 11.008556, ppl: 60388.644531
epoch: 0, batch: 7, sum loss: 256619.671875, avg loss: 11.007106, ppl: 60301.113281
epoch: 0, batch: 8, sum loss: 255716.734375, avg loss: 10.966025, ppl: 57874.105469
epoch: 0, batch: 9, sum loss: 245157.500000, avg loss: 10.966562, ppl: 57905.187500
step_idx: 0, epoch: 0, batch: 0, avg loss: 11.059394, normalized loss: 9.682427, ppl: 63538.027344
step_idx: 1, epoch: 0, batch: 1, avg loss: 11.053112, normalized loss: 9.676146, ppl: 63140.144531
step_idx: 2, epoch: 0, batch: 2, avg loss: 11.054576, normalized loss: 9.677609, ppl: 63232.640625
step_idx: 3, epoch: 0, batch: 3, avg loss: 11.046638, normalized loss: 9.669671, ppl: 62732.664062
step_idx: 4, epoch: 0, batch: 4, avg loss: 11.030095, normalized loss: 9.653129, ppl: 61703.449219
step_idx: 5, epoch: 0, batch: 5, avg loss: 11.047491, normalized loss: 9.670525, ppl: 62786.230469
step_idx: 6, epoch: 0, batch: 6, avg loss: 11.044509, normalized loss: 9.667542, ppl: 62599.273438
step_idx: 7, epoch: 0, batch: 7, avg loss: 11.011090, normalized loss: 9.634124, ppl: 60541.859375
step_idx: 8, epoch: 0, batch: 8, avg loss: 10.985243, normalized loss: 9.608276, ppl: 58997.058594
step_idx: 9, epoch: 0, batch: 9, avg loss: 10.993434, normalized loss: 9.616467, ppl: 59482.292969
```
### Model Prediction
......@@ -143,19 +144,19 @@ python -u infer.py \
--src_vocab_fpath data/vocab.bpe.32000 \
--trg_vocab_fpath data/vocab.bpe.32000 \
--special_token '<s>' '<e>' '<unk>' \
--test_file_pattern data/newstest2013.tok.bpe.32000.en-de \
--test_file_pattern data/newstest2016.tok.bpe.32000.en-de \
--use_wordpiece False \
--token_delimiter ' ' \
--batch_size 4 \
model_path trained_models/pass_20.infer.model \
beam_size 5 \
max_out_len 256
--batch_size 32 \
model_path trained_models/iter_199999.infer.model \
beam_size 4 \
max_out_len 255
```
As with training, prediction requires data- and reader-related arguments, and `python infer.py --help` shows their descriptions (some have slightly different meanings than at training time). Model hyperparameters can likewise be set in the prediction command, but they should match the settings used for training. Prediction also takes some extra arguments compared with training: `model_path` must be set to the directory containing the model, and `beam_size` and `max_out_len` can be set to specify the beam width and maximum depth (translation length) of the beam search; the comments on these can be consulted, and the settings changed, in `InferTaskConfig` in `config.py`.
Running the prediction command above prints the translations to standard output, one highest-scoring translation per input line. For the BPE-encoded EN-DE data, the predicted translations are also in BPE form and must be restored to the original data (i.e., the tokenized data) for correct evaluation; the following command restores the translations in `predict.txt` to `predict.tok.txt` (no further tokenization is needed):
```sh
sed 's/@@ //g' predict.txt > predict.tok.txt
sed -r 's/(@@ )|(@@ ?$)//g' predict.txt > predict.tok.txt
```
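For reference, a hedged Python equivalent of the `sed -r` restoration above (same regex, file names as in the example):
```python
import re

# Remove BPE continuation markers: "@@ " within a line and a trailing "@@".
with open("predict.txt") as fin, open("predict.tok.txt", "w") as fout:
    for line in fin:
        fout.write(re.sub(r"(@@ )|(@@ ?$)", "", line))
```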
For the EN-FR wordpiece data, prediction is similar to EN-DE prediction: change the vocabulary and data files in the command to the paths of the corresponding EN-FR files, and note that `token_delimiter` must be set to `--token_delimiter '\x01'` and `use_wordpiece` to `--use_wordpiece True`, which restores the predicted wordpiece data to its original form on output. To evaluate against tokenized data, the translations still need to be tokenized; [Moses](https://github.com/moses-smt/mosesdecoder) provides a collection of machine-translation scripts. After cloning the mosesdecoder repository with `git clone https://github.com/moses-smt/mosesdecoder.git`, its `tokenizer.perl` script can tokenize the translations in `predict.txt` and write them to `predict.tok.txt`, as follows:
......@@ -163,15 +164,21 @@ sed 's/@@ //g' predict.txt > predict.tok.txt
perl mosesdecoder/scripts/tokenizer/tokenizer.perl -l fr < predict.txt > predict.tok.txt
```
The translations can then be evaluated against the reference translations with the BLEU metric. The BLEU-scoring script is also included in Moses; taking the EN-DE `newstest2013.tok.de` data as an example, run the following command:
The translations can then be evaluated against the reference translations with the BLEU metric. The BLEU-scoring script is also included in Moses; taking the EN-DE `newstest2016.tok.de` data as an example, run the following command:
```sh
perl mosesdecoder/scripts/generic/multi-bleu.perl data/newstest2013.tok.de < predict.tok.txt
perl mosesdecoder/scripts/generic/multi-bleu.perl data/newstest2016.tok.de < predict.tok.txt
```
A result similar to the following can be seen.
A result similar to the following can be seen (predictions from a model trained for 200K iterations on a single machine with two GPUs):
```
BLEU = 25.08, 58.3/31.5/19.6/12.6 (BP=0.966, ratio=0.967, hyp_len=61321, ref_len=63412)
BLEU = 33.08, 64.2/39.2/26.4/18.5 (BP=0.994, ratio=0.994, hyp_len=61971, ref_len=62362)
```
Currently, without model averaging, training with the default configuration on a single machine with eight GPUs (the base-model configuration from the paper) gives a test BLEU of 25. on `newstest2013` and 26. on `newstest2014` for EN-DE, and a test BLEU of 36. on `newstest2014` for EN-FR.
Currently, without model averaging, the EN-DE base model trained for 100K iterations on eight GPUs gives the following test BLEU scores:
| Test set | newstest2013 | newstest2014 | newstest2015 | newstest2016 |
|-|-|-|-|-|
| BLEU | 25.27 | 26.05 | 28.75 | 33.27 |
The EN-FR base model trained for 100K iterations on eight GPUs gives a test BLEU of 36. on `newstest2014`.
### Distributed Training
......
......@@ -9,12 +9,12 @@ class TrainTaskConfig(object):
# the hyper parameters for Adam optimizer.
# This static learning_rate will be multiplied to the LearningRateScheduler
# derived learning rate the to get the final learning rate.
learning_rate = 1
learning_rate = 2.0
beta1 = 0.9
beta2 = 0.98
beta2 = 0.997
eps = 1e-9
# the parameters for learning rate scheduling.
warmup_steps = 4000
warmup_steps = 8000
# the weight used to mix up the ground-truth distribution and the fixed
# uniform distribution in label smoothing when training.
# Set this as zero if label smoothing is not wanted.
......@@ -30,6 +30,8 @@ class TrainTaskConfig(object):
# It should be provided if use checkpoints, since the checkpoint doesn't
# include the training step counter currently.
start_step = 0
# the frequency to save trained models.
save_freq = 10000
class InferTaskConfig(object):
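These scheduler settings feed the `noam_decay` schedule used by the training code later in this commit; a rough, hedged sketch of the resulting rate, assuming the base-model `d_model` of 512:
```python
def noam_lr(step, d_model=512, warmup_steps=8000, learning_rate=2.0):
    # learning_rate * d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5):
    # linear warmup for warmup_steps steps, then inverse-square-root decay.
    step = max(step, 1)
    return learning_rate * d_model ** -0.5 * min(
        step ** -0.5, step * warmup_steps ** -1.5)
```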
......@@ -63,7 +65,6 @@ class ModelHyperParams(object):
# index for <unk> token
unk_idx = 2
# max length of sequences deciding the size of position encoding table.
# Start from 1 and count start and end tokens in.
max_length = 256
# the dimension for word embeddings, which is also the last dimension of
# the input and output of multi-head attention, position-wise feed-forward
......@@ -79,8 +80,14 @@ class ModelHyperParams(object):
n_head = 8
# number of sub-layers to be stacked in the encoder and decoder.
n_layer = 6
# dropout rate used by all dropout layers.
dropout = 0.1
# dropout rates of different modules.
prepostprocess_dropout = 0.1
attention_dropout = 0.1
relu_dropout = 0.1
# to process before each sub-layer
preprocess_cmd = "n" # layer normalization
# to process after each sub-layer
postprocess_cmd = "da" # dropout + residual connection
# random seed used in dropout for CE.
dropout_seed = None
# the flag indicating whether to share embedding and softmax weights.
......
......@@ -156,7 +156,9 @@ def fast_infer(test_data, trg_idx2word, use_wordpiece):
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.d_inner_hid, ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout, ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd, ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing, InferTaskConfig.beam_size,
InferTaskConfig.max_out_len, ModelHyperParams.eos_idx)
......@@ -169,7 +171,7 @@ def fast_infer(test_data, trg_idx2word, use_wordpiece):
])
# This is used here to set dropout to the test mode.
infer_program = fluid.default_main_program().inference_optimize()
infer_program = fluid.default_main_program().clone(for_test=True)
for batch_id, data in enumerate(test_data.batch_generator()):
data_input = prepare_batch_input(
......
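The replacement API above deserves a note: `clone(for_test=True)` keeps the graph but switches ops such as dropout to inference behavior. A minimal standalone sketch (toy layers, not this model):
```python
import paddle.fluid as fluid

x = fluid.layers.data(name="x", shape=[8], dtype="float32")
hidden = fluid.layers.fc(input=x, size=4)
out = fluid.layers.dropout(hidden, dropout_prob=0.1)
# Derive the test program after the graph is built; dropout now acts as
# identity at inference time instead of masking activations.
infer_program = fluid.default_main_program().clone(for_test=True)
```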
import os
import time
import argparse
import ast
import numpy as np
import multiprocessing
import os
import six
import time
import paddle
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
from train import split_data, read_multiple, prepare_batch_input
from model import transformer, position_encoding_init
from optim import LearningRateScheduler
from config import *
import reader
from config import *
from train import pad_batch_data, prepare_data_generator, \
prepare_feed_dict_list, py_reader_provider_wrapper
from model import transformer, position_encoding_init
def parse_args():
parser = argparse.ArgumentParser(
"Profile the training process for Transformer.")
parser = argparse.ArgumentParser("Training for Transformer.")
parser.add_argument(
"--src_vocab_fpath",
type=str,
......@@ -43,38 +42,70 @@ def parse_args():
parser.add_argument(
"--batch_size",
type=int,
default=2048,
default=4096,
help="The number of sequences contained in a mini-batch, or the maximum "
"number of tokens (include paddings) contained in a mini-batch. Note "
"that this represents the number on single device and the actual batch "
"size for multi-devices will multiply the device number.")
parser.add_argument(
"--num_iters",
type=int,
default=10,
help="The maximum number of iterations profiling over.")
parser.add_argument(
"--pool_size",
type=int,
default=10000,
default=200000,
help="The buffer size to pool data.")
parser.add_argument(
"--sort_type",
default="pool",
choices=("global", "pool", "none"),
help="The grain to sort by length: global for all instances; pool for "
"instances in pool; none for no sort.")
parser.add_argument(
"--shuffle",
type=ast.literal_eval,
default=True,
help="The flag indicating whether to shuffle instances in each pass.")
parser.add_argument(
"--shuffle_batch",
type=ast.literal_eval,
default=True,
help="The flag indicating whether to shuffle the data batches.")
parser.add_argument(
"--special_token",
type=str,
default=["<s>", "<e>", "<unk>"],
nargs=3,
help="The <bos>, <eos> and <unk> tokens in the dictionary.")
parser.add_argument(
"--token_delimiter",
type=lambda x: str(x.encode().decode("unicode-escape")),
default=" ",
help="The delimiter used to split tokens in source or target sentences. "
"For EN-DE BPE data we provided, use spaces as token delimiter. "
"For EN-FR wordpiece data we provided, use '\x01' as token delimiter.")
parser.add_argument(
"--use_mem_opt",
type=ast.literal_eval,
default=True,
help="The flag indicating whether to use memory optimization.")
parser.add_argument(
"--use_py_reader",
type=ast.literal_eval,
default=True,
help="The flag indicating whether to use py_reader.")
parser.add_argument(
"--iter_num",
type=int,
default=20,
help="The iteration number to run in profiling.")
parser.add_argument(
"--use_parallel_exe",
type=bool,
default=False,
help="The flag indicating whether to use ParallelExecutor.")
parser.add_argument(
'opts',
help='See config.py for all options',
default=None,
nargs=argparse.REMAINDER)
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help="The device type.")
args = parser.parse_args()
# Append args related to dict
......@@ -91,153 +122,147 @@ def parse_args():
return args
def train_loop(exe, train_progm, init, num_iters, train_data, dev_count,
sum_cost, avg_cost, lr_scheduler, token_num, predict):
data_input_names = encoder_data_input_fields + decoder_data_input_fields[:
-1] + label_data_input_fields
start_time = time.time()
exec_time = 0.0
for batch_id, data in enumerate(train_data()):
if batch_id >= num_iters:
break
feed_list = []
total_num_token = 0
for place_id, data_buffer in enumerate(
split_data(
data, num_part=dev_count)):
data_input_dict, num_token = prepare_batch_input(
data_buffer, data_input_names, ModelHyperParams.eos_idx,
ModelHyperParams.eos_idx, ModelHyperParams.n_head,
ModelHyperParams.d_model)
total_num_token += num_token
feed_kv_pairs = data_input_dict.items()
lr_rate = lr_scheduler.update_learning_rate()
feed_kv_pairs += {lr_scheduler.learning_rate.name: lr_rate}.items()
feed_list.append(dict(feed_kv_pairs))
if not init:
for pos_enc_param_name in pos_enc_param_names:
pos_enc = position_encoding_init(
ModelHyperParams.max_length + 1,
ModelHyperParams.d_model)
feed_list[place_id][pos_enc_param_name] = pos_enc
for feed_dict in feed_list:
feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
exe_start_time = time.time()
if dev_count > 1:
# prallel executor
outs = exe.run(fetch_list=[sum_cost.name, token_num.name],
feed=feed_list)
else:
# executor
outs = exe.run(fetch_list=[sum_cost, token_num], feed=feed_list[0])
exec_time += time.time() - exe_start_time
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
total_sum_cost = sum_cost_val.sum() # sum the cost from multi-devices
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
print("batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
(batch_id, total_sum_cost, total_avg_cost,
np.exp([min(total_avg_cost, 100)])))
init = True
return time.time() - start_time, exec_time
def profile(args):
print args
if args.device == 'CPU':
TrainTaskConfig.use_gpu = False
if not TrainTaskConfig.use_gpu:
place = fluid.CPUPlace()
dev_count = multiprocessing.cpu_count()
else:
def main(args):
train_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
with fluid.unique_name.guard():
sum_cost, avg_cost, predict, token_num, pyreader = transformer(
ModelHyperParams.src_vocab_size,
ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1,
ModelHyperParams.n_layer,
ModelHyperParams.n_head,
ModelHyperParams.d_key,
ModelHyperParams.d_value,
ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid,
ModelHyperParams.prepostprocess_dropout,
ModelHyperParams.attention_dropout,
ModelHyperParams.relu_dropout,
ModelHyperParams.preprocess_cmd,
ModelHyperParams.postprocess_cmd,
ModelHyperParams.weight_sharing,
TrainTaskConfig.label_smooth_eps,
use_py_reader=args.use_py_reader,
is_test=False)
lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
optimizer = fluid.optimizer.Adam(
learning_rate=lr_decay * TrainTaskConfig.learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
optimizer.minimize(avg_cost)
if args.use_mem_opt:
fluid.memory_optimize(train_prog)
if TrainTaskConfig.use_gpu:
place = fluid.CUDAPlace(0)
dev_count = fluid.core.get_cuda_device_count()
else:
place = fluid.CPUPlace()
dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
exe = fluid.Executor(place)
sum_cost, avg_cost, predict, token_num = transformer(
ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
ModelHyperParams.n_head, ModelHyperParams.d_key,
ModelHyperParams.d_value, ModelHyperParams.d_model,
ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
ModelHyperParams.weight_sharing, TrainTaskConfig.label_smooth_eps)
lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
TrainTaskConfig.warmup_steps,
TrainTaskConfig.learning_rate)
optimizer = fluid.optimizer.Adam(
learning_rate=lr_scheduler.learning_rate,
beta1=TrainTaskConfig.beta1,
beta2=TrainTaskConfig.beta2,
epsilon=TrainTaskConfig.eps)
optimizer.minimize(sum_cost)
# Initialize the parameters.
if TrainTaskConfig.ckpt_path:
fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
lr_scheduler.current_steps = TrainTaskConfig.start_step
else:
exe.run(fluid.framework.default_startup_program())
# Disable all sorts for they will be done in the 1st batch.
train_data = reader.DataReader(
src_vocab_fpath=args.src_vocab_fpath,
trg_vocab_fpath=args.trg_vocab_fpath,
fpattern=args.train_file_pattern,
use_token_batch=args.use_token_batch,
batch_size=args.batch_size * (1 if args.use_token_batch else dev_count),
pool_size=args.pool_size,
sort_type='none',
shuffle=False,
shuffle_batch=False,
start_mark=args.special_token[0],
end_mark=args.special_token[1],
unk_mark=args.special_token[2],
# count start and end tokens out
max_length=ModelHyperParams.max_length - 2,
clip_last_batch=False)
train_data = read_multiple(
reader=train_data.batch_generator,
count=dev_count if args.use_token_batch else 1)
if dev_count > 1:
build_strategy = fluid.BuildStrategy()
build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
train_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
loss_name=sum_cost.name,
main_program=fluid.default_main_program(),
build_strategy=build_strategy)
print("Warming up ...")
train_loop(exe if dev_count == 1 else train_exe,
fluid.default_main_program(), False, 3, train_data, dev_count,
sum_cost, avg_cost, lr_scheduler, token_num, predict)
print("\nProfiling ...")
if dev_count == 1:
with profiler.profiler('All', 'total', '/tmp/profile_file'):
total_time, exec_time = train_loop(
exe,
fluid.default_main_program(), True, args.num_iters, train_data,
dev_count, sum_cost, avg_cost, lr_scheduler, token_num, predict)
exe.run(startup_prog)
exec_strategy = fluid.ExecutionStrategy()
# For faster executor
exec_strategy.use_experimental_executor = True
exec_strategy.num_iteration_per_drop_scope = 5
build_strategy = fluid.BuildStrategy()
# Since the token number differs among devices, customize gradient scale to
# use token average cost among multi-devices. and the gradient scale is
# `1 / token_number` for average cost.
build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
train_exe = fluid.ParallelExecutor(
use_cuda=TrainTaskConfig.use_gpu,
loss_name=avg_cost.name,
main_program=train_prog,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
# the best cross-entropy value with label smoothing
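# With smoothing eps spread over a target vocabulary of size V, the
# cross-entropy has a floor of -((1 - eps) * log(1 - eps) + eps * log(eps / (V - 1)));
# subtracting this floor gives the "normalized loss" printed in the logs.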
loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
(1. - TrainTaskConfig.label_smooth_eps
)) + TrainTaskConfig.label_smooth_eps *
np.log(TrainTaskConfig.label_smooth_eps / (
ModelHyperParams.trg_vocab_size - 1) + 1e-20))
train_data = prepare_data_generator(
args, is_test=False, count=dev_count, pyreader=pyreader)
if args.use_py_reader:
pyreader.start()
data_generator = None
else:
total_time, exec_time = train_loop(
train_exe,
fluid.default_main_program(), True, args.num_iters, train_data,
dev_count, sum_cost, avg_cost, lr_scheduler, token_num, predict)
print("Elapsed time: total %f s, in executor %f s" %
(total_time, exec_time))
data_generator = train_data()
def run(iter_num):
reader_time = []
run_time = []
for step_idx in six.moves.xrange(iter_num):
try:
start_time = time.time()
feed_dict_list = prepare_feed_dict_list(data_generator,
init_flag, dev_count)
end_time = time.time()
reader_time.append(end_time - start_time)
start_time = time.time()
if args.use_parallel_exe:
outs = train_exe.run(
fetch_list=[sum_cost.name, token_num.name],
feed=feed_dict_list)
else:
outs = exe.run(program=train_prog,
fetch_list=[sum_cost.name, token_num.name],
feed=feed_dict_list[0]
if feed_dict_list is not None else None)
end_time = time.time()
run_time.append(end_time - start_time)
sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[
1])
# sum the cost from multi-devices
total_sum_cost = sum_cost_val.sum()
total_token_num = token_num_val.sum()
total_avg_cost = total_sum_cost / total_token_num
print("step_idx: %d, avg loss: %f, "
"normalized loss: %f, ppl: %f" %
(step_idx, total_avg_cost,
total_avg_cost - loss_normalizer,
np.exp([min(total_avg_cost, 100)])))
except (StopIteration, fluid.core.EOFException):
# The current pass is over.
if args.use_py_reader:
pyreader.reset()
pyreader.start()
break
return reader_time, run_time
# start-up
init_flag = True
run(1)
init_flag = False
# profiling
start = time.time()
# currently only support profiling on one device
with profiler.profiler('All', 'total', '/tmp/profile_file'):
reader_time, run_time = run(args.iter_num)
end = time.time()
total_time = end - start
print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
total_time, np.sum(reader_time), np.sum(run_time)))
if __name__ == "__main__":
args = parse_args()
profile(args)
main(args)
import glob
import six
import os
import tarfile
......@@ -12,15 +13,16 @@ class SortType(object):
class Converter(object):
def __init__(self, vocab, beg, end, unk, delimiter):
def __init__(self, vocab, beg, end, unk, delimiter, add_beg):
self._vocab = vocab
self._beg = beg
self._end = end
self._unk = unk
self._delimiter = delimiter
self._add_beg = add_beg
def __call__(self, sentence):
return [self._beg] + [
return ([self._beg] if self._add_beg else []) + [
self._vocab.get(w, self._unk)
for w in sentence.split(self._delimiter)
] + [self._end]
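For illustration, a hedged sketch of the new flag's effect (vocabulary and indices made up; the `<s>`/`<e>`/`<unk>` ids mirror the defaults):
```python
vocab = {"hello": 3, "world": 4}
src = Converter(vocab, beg=0, end=1, unk=2, delimiter=" ", add_beg=False)
trg = Converter(vocab, beg=0, end=1, unk=2, delimiter=" ", add_beg=True)
print(src("hello world"))  # [3, 4, 1]    -- source side: no leading <s>
print(trg("hello world"))  # [0, 3, 4, 1] -- target side keeps <s> for decoding
```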
......@@ -215,7 +217,8 @@ class DataReader(object):
beg=self._src_vocab[start_mark],
end=self._src_vocab[end_mark],
unk=self._src_vocab[unk_mark],
delimiter=self._token_delimiter)
delimiter=self._token_delimiter,
add_beg=False)
]
if not self._only_src:
converters.append(
......@@ -224,7 +227,8 @@ class DataReader(object):
beg=self._trg_vocab[start_mark],
end=self._trg_vocab[end_mark],
unk=self._trg_vocab[unk_mark],
delimiter=self._token_delimiter))
delimiter=self._token_delimiter,
add_beg=True))
converters = ComposedConverter(converters)
......@@ -259,8 +263,10 @@ class DataReader(object):
if not os.path.isfile(fpath):
raise IOError("Invalid file: %s" % fpath)
with open(fpath, "r") as f:
with open(fpath, "rb") as f:
for line in f:
if six.PY3:
line = line.decode()
fields = line.strip("\n").split(self._field_delimiter)
if (not self._only_src and len(fields) == 2) or (
self._only_src and len(fields) == 1):
......@@ -269,8 +275,10 @@ class DataReader(object):
@staticmethod
def load_dict(dict_path, reverse=False):
word_dict = {}
with open(dict_path, "r") as fdict:
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
if six.PY3:
line = line.decode()
if reverse:
word_dict[idx] = line.strip("\n")
else:
......@@ -280,8 +288,7 @@ class DataReader(object):
def batch_generator(self):
# global sort or global shuffle
if self._sort_type == SortType.GLOBAL:
infos = sorted(
self._sample_infos, key=lambda x: x.max_len, reverse=True)
infos = sorted(self._sample_infos, key=lambda x: x.max_len)
else:
if self._shuffle:
infos = self._sample_infos
......
......@@ -20,3 +20,4 @@ data/pascalvoc/trainval.txt
log*
*.log
ssd_mobilenet_v1_pascalvoc*
......@@ -38,7 +38,8 @@ train_parameters = {
"batch_size": 64,
"lr": 0.001,
"lr_epochs": [40, 60, 80, 100],
"lr_decay": [1, 0.5, 0.25, 0.1, 0.01]
"lr_decay": [1, 0.5, 0.25, 0.1, 0.01],
"ap_version": '11point',
},
"coco2014": {
"train_images": 82783,
......@@ -47,7 +48,8 @@ train_parameters = {
"batch_size": 64,
"lr": 0.001,
"lr_epochs": [12, 19],
"lr_decay": [1, 0.5, 0.25]
"lr_decay": [1, 0.5, 0.25],
"ap_version": 'integral', # should use eval_coco_map.py to test model
},
"coco2017": {
"train_images": 118287,
......@@ -56,7 +58,8 @@ train_parameters = {
"batch_size": 64,
"lr": 0.001,
"lr_epochs": [12, 19],
"lr_decay": [1, 0.5, 0.25]
"lr_decay": [1, 0.5, 0.25],
"ap_version": 'integral', # should use eval_coco_map.py to test model
}
}
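The new `ap_version` entries pair each dataset with the AP metric its evaluation expects; a hedged usage sketch (the first dataset key is hypothetical here, as it is collapsed by the diff view):
```python
# '11point' interpolated AP for PASCAL VOC; 'integral' AP for the COCO
# configs, which should be evaluated with eval_coco_map.py instead.
train_params = train_parameters["pascalvoc"]
print(train_params["ap_version"])  # '11point'
```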
......@@ -77,6 +80,7 @@ def optimizer_setting(train_params):
def build_program(main_prog, startup_prog, train_params, is_train):
image_shape = train_params['image_shape']
class_num = train_params['class_num']
ap_version = train_params['ap_version']
with fluid.program_guard(main_prog, startup_prog):
py_reader = fluid.layers.py_reader(
capacity=64,
......@@ -97,16 +101,15 @@ def build_program(main_prog, startup_prog, train_params, is_train):
nmsed_out = fluid.layers.detection_output(
locs, confs, box, box_var, nms_threshold=0.45)
with fluid.program_guard(main_prog):
loss = fluid.evaluator.DetectionMAP(
nmsed_out,
gt_label,
gt_box,
difficult,
class_num,
overlap_threshold=0.5,
evaluate_difficult=False,
ap_version=args.ap_version)
loss = fluid.evaluator.DetectionMAP(
nmsed_out,
gt_label,
gt_box,
difficult,
class_num,
overlap_threshold=0.5,
evaluate_difficult=False,
ap_version=ap_version)
return py_reader, loss
......@@ -126,7 +129,7 @@ def train(args,
devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
devices_num = len(devices.split(","))
batch_size = train_params['batch_size']
epoc_num = train_params['epoch_num']
epoc_num = train_params['epoc_num']
batch_size_per_device = batch_size // devices_num
iters_per_epoc = train_params["train_images"] // batch_size
num_workers = 8
......@@ -230,7 +233,7 @@ def train(args,
loss_v = np.mean(np.array(loss_v))
every_epoc_loss.append(loss_v)
if batch_id % 20 == 0:
print("Epoc {0}, batch {1}, loss {2}, time {3}".format(
print("Epoc {:d}, batch {:d}, loss {:.6f}, time {:.5f}".format(
epoc_id, batch_id, loss_v, start_time - prev_start_time))
end_time = time.time()
total_time += end_time - start_time
......
......@@ -2,6 +2,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
import six
decoder_size = 128
word_vector_dim = 128
......@@ -22,7 +23,7 @@ def conv_bn_pool(input,
pool=True,
use_cudnn=True):
tmp = input
for i in xrange(group):
for i in six.moves.xrange(group):
filter_size = 3
conv_std = (2.0 / (filter_size**2 * tmp.shape[1]))**0.5
conv_param = fluid.ParamAttr(
......
import paddle.v2 as paddle
import paddle.fluid as fluid
from utility import add_arguments, print_arguments, to_lodtensor, get_ctc_feeder_data, get_attention_feeder_data
from attention_model import attention_eval
......
from __future__ import print_function
import paddle.v2 as paddle
import paddle.fluid as fluid
from utility import add_arguments, print_arguments, to_lodtensor, get_ctc_feeder_data, get_attention_feeder_for_infer
import paddle.fluid.profiler as profiler
......
import numpy as np
import paddle.v2 as paddle
import paddle.fluid as fluid
# reproducible
np.random.seed(1)
......
......@@ -111,7 +111,6 @@ According to the configuration of evaluation, the output log is like:
Inference is used to get prediction score or video features based on trained models.
```
python infer.py \
--batch_size=128 \
--class_dim=101 \
--image_shape=3,224,224 \
--with_mem_opt=True \
......
......@@ -9,27 +9,33 @@ for line in f.readlines():
dd[name.lower()] = int(label) - 1
f.close()
# generate pkl
path = 'train/'
savepath = 'train_pkl/'
if not os.path.exists(savepath):
os.makedirs(savepath)
fw = open('train.list', 'w')
for folder in os.listdir(path):
vidid = folder.split('_', 1)[1]
this_label = dd[folder.split('_')[1].lower()]
this_feat = []
for img in sorted(os.listdir(path + folder)):
fout = open(path + folder + '/' + img, 'rb')
this_feat.append(fout.read())
fout.close()
res = [vidid, this_label, this_feat]
outp = open(savepath + vidid + '.pkl', 'wb')
cPickle.dump(res, outp, protocol=cPickle.HIGHEST_PROTOCOL)
outp.close()
fw.write('data/train_pkl/%s.pkl\n' % vidid)
fw.close()
def generate_pkl(mode):
# generate pkl
path = '%s/' % mode
savepath = '%s_pkl/' % mode
if not os.path.exists(savepath):
os.makedirs(savepath)
fw = open('%s.list' % mode, 'w')
for folder in os.listdir(path):
vidid = folder.split('_', 1)[1]
this_label = dd[folder.split('_')[1].lower()]
this_feat = []
for img in sorted(os.listdir(path + folder)):
fout = open(path + folder + '/' + img, 'rb')
this_feat.append(fout.read())
fout.close()
res = [vidid, this_label, this_feat]
outp = open(savepath + vidid + '.pkl', 'wb')
cPickle.dump(res, outp, protocol=cPickle.HIGHEST_PROTOCOL)
outp.close()
fw.write('data/%s%s.pkl\n' % (savepath, vidid))
fw.close()
generate_pkl('train')
generate_pkl('test')
......@@ -2,7 +2,7 @@ import os
import numpy as np
import time
import sys
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
from resnet import TSN_ResNet
import reader
......
......@@ -2,7 +2,7 @@ import os
import numpy as np
import time
import sys
import paddle.v2 as paddle
import paddle
import paddle.fluid as fluid
from resnet import TSN_ResNet
import reader
......
......@@ -5,7 +5,7 @@ import functools
import cPickle
from cStringIO import StringIO
import numpy as np
import paddle.v2 as paddle
import paddle
from PIL import Image, ImageEnhance
random.seed(0)
......@@ -16,8 +16,8 @@ THREAD = 8
BUF_SIZE = 1024
TRAIN_LIST = 'data/train.list'
TEST_LIST = 'data/val.list'
INFER_LIST = 'data/val.list'
TEST_LIST = 'data/test.list'
INFER_LIST = 'data/test.list'
img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
......
......@@ -2,6 +2,7 @@ import os
import numpy as np
import time
import sys
import paddle
import paddle.fluid as fluid
from resnet import TSN_ResNet
import reader
......
......@@ -12,23 +12,23 @@ The word embedding expresses words with a real vector. Each dimension of the vec
In the example of word vectors, we show how to use Hierarchical-Sigmoid and Noise Contrastive Estimation (NCE) to accelerate word-vector learning.
- 1.1 [Hsigmoid Accelerated Word Vector Training](https://github.com/PaddlePaddle/models/tree/develop/v2/hsigmoid)
- 1.2 [Noise Contrastive Estimation Accelerated Word Vector Training](https://github.com/PaddlePaddle/models/tree/develop/v2/nce_cost)
- 1.1 [Hsigmoid Accelerated Word Vector Training](https://github.com/PaddlePaddle/models/tree/develop/legacy/hsigmoid)
- 1.2 [Noise Contrastive Estimation Accelerated Word Vector Training](https://github.com/PaddlePaddle/models/tree/develop/legacy/nce_cost)
## 2. RNN language model
The language model is important in the field of natural language processing. In addition to producing word vectors (a by-product of language model training), it can also help us generate text. Given a number of words, the language model can predict the next most likely word. In the example of using the language model to generate text, we focus on the recurrent neural network language model. Following the instructions in the document, it can quickly be adapted to your own training corpus to build interesting applications such as automatic poetry writing and automatic prose writing.
- 2.1 [Generate text using the RNN language model](https://github.com/PaddlePaddle/models/tree/develop/v2/generate_sequence_by_rnn_lm)
- 2.1 [Generate text using the RNN language model](https://github.com/PaddlePaddle/models/tree/develop/legacy/generate_sequence_by_rnn_lm)
## 3. Click-Through Rate prediction
The click-through rate model predicts the probability that a user will click on an ad, and is widely used in advertising technology. In the early stages of click-through rate prediction, Logistic Regression delivered good learning performance on large-scale sparse features. In recent years, DNN models have gradually taken over the task thanks to their strong learning ability.
In the example of click-through rate estimation, we first present Google's Wide & Deep model, which combines the learning power of DNNs with the suitability of logistic regression for large-scale sparse features. Then we provide the deep factorization machine for click-through rate prediction. The deep factorization machine combines the factorization machine and deep neural networks to model both low-order and high-order interactions of input features.
- 3.1 [Click-Through Rate Model](https://github.com/PaddlePaddle/models/tree/develop/v2/ctr)
- 3.2 [Deep Factorization Machine for Click-Through Rate prediction](https://github.com/PaddlePaddle/models/tree/develop/v2/deep_fm)
- 3.1 [Click-Through Rate Model](https://github.com/PaddlePaddle/models/tree/develop/legacy/ctr)
- 3.2 [Deep Factorization Machine for Click-Through Rate prediction](https://github.com/PaddlePaddle/models/tree/develop/legacy/deep_fm)
## 4. Text classification
......@@ -36,7 +36,7 @@ Text classification is one of the most basic tasks in natural language processin
For text classification, we provide a non-sequential text classification model based on DNN and CNN. (For LSTM-based models, please refer to PaddleBook [Sentiment Analysis](http://www.paddlepaddle.org/docs/develop/book/06.understand_sentiment/index.html)).
- 4.1 [Sentiment analysis based on DNN / CNN](https://github.com/PaddlePaddle/models/tree/develop/v2/text_classification)
- 4.1 [Sentiment analysis based on DNN / CNN](https://github.com/PaddlePaddle/models/tree/develop/legacy/text_classification)
## 5. Learning to rank
......@@ -45,14 +45,14 @@ The deep neural network can be used to model the scoring function to form va
The algorithms for learning to rank are usually categorized into three groups by their input representation and loss function: pointwise, pairwise, and listwise approaches. Here we demonstrate the RankLoss loss function (a pairwise approach) and the LambdaRank loss function (a listwise approach). (For pointwise approaches, please refer to [Recommended System](http://www.paddlepaddle.org/docs/develop/book/05.recommender_system/index.html)).
- 5.1 [Learning to rank based on Pairwise and Listwise approaches](https://github.com/PaddlePaddle/models/tree/develop/v2/ltr)
- 5.1 [Learning to rank based on Pairwise and Listwise approaches](https://github.com/PaddlePaddle/models/tree/develop/legacy/ltr)
## 6. Semantic model
The deep structured semantic model uses a DNN to learn low-dimensional vector representations in a continuous semantic space, and finally models the semantic similarity between two sentences.
In this example, we demonstrate how to use PaddlePaddle to implement a generic deep structured semantic model to model the semantic similarity between two strings. The model supports different network structures such as CNN (Convolutional Network), FC (Fully Connected Network), and RNN (Recurrent Neural Network), as well as different loss functions for classification, regression, and ranking.
- 6.1 [Deep structured semantic model](https://github.com/PaddlePaddle/models/tree/develop/v2/dssm)
- 6.1 [Deep structured semantic model](https://github.com/PaddlePaddle/models/tree/develop/legacy/dssm)
## 7. Sequence tagging
......@@ -60,7 +60,7 @@ Given the input sequence, the sequence tagging model is one of the most basic ta
In the sequence tagging example, we describe how to train an end-to-end sequence tagging model, taking the Named Entity Recognition (NER) task as an example.
- 7.1 [Name Entity Recognition](https://github.com/PaddlePaddle/models/tree/develop/v2/sequence_tagging_for_ner)
- 7.1 [Name Entity Recognition](https://github.com/PaddlePaddle/models/tree/develop/legacy/sequence_tagging_for_ner)
## 8. Sequence to sequence learning
......@@ -68,19 +68,19 @@ Sequence-to-sequence model has a wide range of applications. This includes machi
As an example of sequence-to-sequence learning, we take the machine translation task. We demonstrate the sequence-to-sequence mapping model without attention mechanism, which is the basis for all sequence-to-sequence learning models. We will use scheduled sampling to mitigate the error accumulation problem in the RNN model, and demonstrate machine translation with an external memory mechanism.
- 8.1 [Basic Sequence-to-sequence model](https://github.com/PaddlePaddle/models/tree/develop/v2/nmt_without_attention)
- 8.1 [Basic Sequence-to-sequence model](https://github.com/PaddlePaddle/models/tree/develop/legacy/nmt_without_attention)
## 9. Image classification
For the example of image classification, we show you how to train AlexNet, VGG, GoogLeNet, ResNet, Inception-v4, Inception-Resnet-V2 and Xception models in PaddlePaddle. We also provide model conversion tools that convert Caffe- or TensorFlow-trained model files into PaddlePaddle model files.
- 9.1 [convert Caffe model file to PaddlePaddle model file](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification/caffe2paddle)
- 9.2 [convert TensorFlow model file to PaddlePaddle model file](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification/tf2paddle)
- 9.3 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification)
- 9.4 [VGG](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification)
- 9.5 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification)
- 9.6 [Inception-v4](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification)
- 9.7 [Inception-Resnet-V2](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification)
- 9.8 [Xception](https://github.com/PaddlePaddle/models/tree/develop/v2/image_classification)
- 9.1 [convert Caffe model file to PaddlePaddle model file](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification/caffe2paddle)
- 9.2 [convert TensorFlow model file to PaddlePaddle model file](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification/tf2paddle)
- 9.3 [AlexNet](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification)
- 9.4 [VGG](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification)
- 9.5 [Residual Network](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification)
- 9.6 [Inception-v4](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification)
- 9.7 [Inception-Resnet-V2](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification)
- 9.8 [Xception](https://github.com/PaddlePaddle/models/tree/develop/legacy/image_classification)
This tutorial is contributed by [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) and licensed under the [Apache-2.0 license](LICENSE).