diff --git a/README.md b/README.md
index 3ded5b78bf7b038eb7c1efd27352ad14b822e1f8..1c37b89256b4b74b5bae6abed1ec5587cbee4ebb 100644
--- a/README.md
+++ b/README.md
@@ -8,15 +8,11 @@ PaddlePaddle provides a rich set of computational units to enable users to adopt
 
 - [fluid models](fluid): use PaddlePaddle's Fluid APIs. We especially recommend users to use Fluid models.
 
-- [legacy models](legacy): use PaddlePaddle's v2 APIs.
-
 
 PaddlePaddle 提供了丰富的计算单元，使得用户可以采用模块化的方法解决各种学习问题。在此repo中，我们展示了如何用 PaddlePaddle 来解决常见的机器学习任务，提供若干种不同的易学易用的神经网络模型。
 
 - [fluid模型](fluid): 使用 PaddlePaddle Fluid版本的 APIs，我们特别推荐您使用Fluid模型。
 
-- [legacy模型](legacy): 使用 PaddlePaddle v2版本的 APIs。
-
 
 ## License
 This tutorial is contributed by [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) and licensed under the [Apache-2.0 license](LICENSE).
diff --git a/fluid/PaddleCV/deeplabv3+/.run_ce.sh b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..540fb964ba94fd29dc28bb51342cdba839d433e7
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+DATASET_PATH=${HOME}/.cache/paddle/dataset/cityscape/
+
+cudaid=${deeplabv3plus:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output1 \
+--dataset_path=$DATASET_PATH \
+--enable_ce | python _ce.py
+
+cudaid=${deeplabv3plus_m:=0,1,2,3} # use 0,1,2,3 card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output4 \
+--dataset_path=$DATASET_PATH \
+--enable_ce | python _ce.py
diff --git a/fluid/PaddleCV/deeplabv3+/__init__.py b/fluid/PaddleCV/deeplabv3+/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fluid/PaddleCV/deeplabv3+/_ce.py b/fluid/PaddleCV/deeplabv3+/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0127d6445213b9d3934220fa36e9eb44d3e04b4
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/_ce.py
@@ -0,0 +1,60 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.1, 0, actived=True)
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.05, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.1, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.05, 0)
+
+tracking_kpis = [
+        each_pass_duration_card1_kpi,
+        train_loss_card1_kpi,
+        each_pass_duration_card4_kpi,
+        train_loss_card4_kpi,
+        ]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/PaddleCV/deeplabv3+/train.py b/fluid/PaddleCV/deeplabv3+/train.py
old mode 100644
new mode 100755
index 817d53d173467f9146918ec9bb6b44141eb0ac3f..e009f76e0e16be9e4a5db532615cefac258fada1
--- a/fluid/PaddleCV/deeplabv3+/train.py
+++ b/fluid/PaddleCV/deeplabv3+/train.py
@@ -34,6 +34,10 @@ def add_arguments():
     add_argument('parallel', bool, False, "using ParallelExecutor.")
     add_argument('use_gpu', bool, True, "Whether use GPU or CPU.")
     add_argument('num_classes', int, 19, "Number of classes.")
+    parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run the task with continuous evaluation logs.')
 
 
 def load_model():
@@ -51,7 +55,10 @@ def load_model():
     else:
         if args.num_classes == 19:
             fluid.io.load_params(
-                exe, dirname=args.init_weights_path, main_program=tp)
+                exe,
+                dirname="",
+                filename=args.init_weights_path,
+                main_program=tp)
         else:
             fluid.io.load_vars(
                 exe, dirname="", filename=args.init_weights_path, vars=myvars)
@@ -84,6 +91,15 @@ def loss(logit, label):
     return loss, label_nignore
 
 
+def get_cards(args):
+    if args.enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        num = len(cards.split(","))
+        return num
+    else:
+        return args.num_devices
+
+
 CityscapeDataset = reader.CityscapeDataset
 parser = argparse.ArgumentParser()
 
@@ -99,6 +115,13 @@ deeplabv3p = models.deeplabv3p
 
 sp = fluid.Program()
 tp = fluid.Program()
+
+# only for ce
+if args.enable_ce:
+    SEED = 102
+    sp.random_seed = SEED
+    tp.random_seed = SEED
+
 crop_size = args.train_crop_size
 batch_size = args.batch_size
 image_shape = [crop_size, crop_size]
@@ -155,7 +178,13 @@ if args.parallel:
 
 batches = dataset.get_batch_generator(batch_size, total_step)
 
+total_time = 0.0
+epoch_idx = 0
+train_loss = 0
+
 for i, imgs, labels, names in batches:
+    epoch_idx += 1
+    begin_time = time.time()
     prev_start_time = time.time()
     if args.parallel:
         retv = exe_p.run(fetch_list=[pred.name, loss_mean.name],
@@ -167,11 +196,21 @@ for i, imgs, labels, names in batches:
                              'label': labels},
                        fetch_list=[pred, loss_mean])
     end_time = time.time()
+    total_time += end_time - begin_time
     if i % 100 == 0:
         print("Model is saved to", args.save_weights_path)
         save_model()
     print("step {:d}, loss: {:.6f}, step_time_cost: {:.3f}".format(
         i, np.mean(retv[1]), end_time - prev_start_time))
 
+    # only for ce
+    train_loss = np.mean(retv[1])
+
+if args.enable_ce:
+    gpu_num = get_cards(args)
+    print("kpis\teach_pass_duration_card%s\t%s" %
+          (gpu_num, total_time / epoch_idx))
+    print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, train_loss))
+
 print("Training done. Model is saved to", args.save_weights_path)
 save_model()
diff --git a/fluid/PaddleCV/face_detection/.run_ce.sh b/fluid/PaddleCV/face_detection/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..0b8632516b06b9ea48691a098b7ac25b171decd5
--- /dev/null
+++ b/fluid/PaddleCV/face_detection/.run_ce.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+
+cudaid=${face_detection:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py --batch_size=2 --epoc_num=1 --batch_num=200 --parallel=False --enable_ce | python _ce.py
+
+
+cudaid=${face_detection_m:=0,1,2,3} # use 0,1,2,3 card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py --batch_size=8 --epoc_num=1 --batch_num=200 --parallel=False --enable_ce | python _ce.py
+
diff --git a/fluid/PaddleCV/face_detection/__init__.py b/fluid/PaddleCV/face_detection/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fluid/PaddleCV/face_detection/_ce.py b/fluid/PaddleCV/face_detection/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d8325dca6d8f7a08c8f7f0b734d2643b3c550b1
--- /dev/null
+++ b/fluid/PaddleCV/face_detection/_ce.py
@@ -0,0 +1,65 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
+train_face_loss_card1_kpi = CostKpi('train_face_loss_card1', 0.08, 0)
+train_head_loss_card1_kpi = CostKpi('train_head_loss_card1', 0.08, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
+train_face_loss_card4_kpi = CostKpi('train_face_loss_card4', 0.08, 0)
+train_head_loss_card4_kpi = CostKpi('train_head_loss_card4', 0.08, 0)
+
+tracking_kpis = [
+        each_pass_duration_card1_kpi,
+        train_face_loss_card1_kpi,
+        train_head_loss_card1_kpi,
+        each_pass_duration_card4_kpi,
+        train_face_loss_card4_kpi,
+        train_head_loss_card4_kpi,
+        ]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/PaddleCV/face_detection/train.py b/fluid/PaddleCV/face_detection/train.py
index 67cec03b95ba5ffe1a5230c287bd12a49b90bb34..71caab9702762cc7f823e6be3f22c9ed278ca364 100644
--- a/fluid/PaddleCV/face_detection/train.py
+++ b/fluid/PaddleCV/face_detection/train.py
@@ -32,6 +32,9 @@ add_arg('mean_BGR',         str,   '104., 117., 123.', "Mean value for B,G,R cha
 add_arg('with_mem_opt',     bool,  True,            "Whether to use memory optimization or not.")
 add_arg('pretrained_model', str,   './vgg_ilsvrc_16_fc_reduced/', "The init model path.")
 add_arg('data_dir',         str,   'data',          "The base dir of dataset")
+parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
+parser.add_argument('--batch_num', type=int, help="batch num for ce")
+parser.add_argument('--num_devices', type=int, default=1, help='Number of GPU devices')
 #yapf: enable
 
 train_parameters = {
@@ -119,6 +122,16 @@ def train(args, config, train_params, train_file_list):
     startup_prog = fluid.Program()
     train_prog = fluid.Program()
 
+    #only for ce
+    if args.enable_ce:
+        SEED = 102
+        startup_prog.random_seed = SEED
+        train_prog.random_seed = SEED
+        num_workers = 1
+        pretrained_model = ""
+        if args.batch_num != None:
+            iters_per_epoc = args.batch_num
+
     train_py_reader, fetches, loss = build_program(
         train_params = train_params,
         main_prog = train_prog,
@@ -171,7 +184,12 @@ def train(args, config, train_params, train_file_list):
 
     train_py_reader.start()
     try:
+        total_time = 0.0
+        epoch_idx = 0
+        face_loss = 0
+        head_loss = 0
         for pass_id in range(start_epoc, epoc_num):
+            epoch_idx += 1
             start_time = time.time()
             prev_start_time = start_time
             end_time = 0
@@ -198,14 +216,38 @@ def train(args, config, train_params, train_file_list):
                               "time {:.5f}".format(pass_id,
                                batch_id, fetch_vars[0], fetch_vars[1],
                                start_time - prev_start_time))
+                face_loss = fetch_vars[0]
+                head_loss = fetch_vars[1]
+            epoch_end_time = time.time()
+            total_time += epoch_end_time - start_time
             if pass_id % 1 == 0 or pass_id == epoc_num - 1:
                 save_model(str(pass_id), train_prog)
+        # only for ce
+        if args.enable_ce:
+            gpu_num = get_cards(args)
+            print("kpis\teach_pass_duration_card%s\t%s" %
+                    (gpu_num, total_time / epoch_idx))
+            print("kpis\ttrain_face_loss_card%s\t%s" %
+                    (gpu_num, face_loss))
+            print("kpis\ttrain_head_loss_card%s\t%s" %
+                    (gpu_num, head_loss))
+
     except fluid.core.EOFException:
         train_py_reader.reset()
     except StopIteration:
         train_py_reader.reset()
     train_py_reader.reset()
 
+
+def get_cards(args):
+    if args.enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        num = len(cards.split(","))
+        return num
+    else:
+        return args.num_devices
+
+
 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
diff --git a/fluid/PaddleCV/faster_rcnn/.run_ce.sh b/fluid/PaddleCV/faster_rcnn/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..af2f21da73fa14589258b565a74f78e25dbd4e84
--- /dev/null
+++ b/fluid/PaddleCV/faster_rcnn/.run_ce.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+
+cudaid=${face_detection:=0} # use 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py --model_save_dir=output/ --data_dir=dataset/coco/ --max_iter=10 --enable_ce --pretrained_model=./imagenet_resnet50_fusebn | python _ce.py
+
+
+cudaid=${face_detection_m:=0,1,2,3} # use 0,1,2,3 card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py --model_save_dir=output/ --data_dir=dataset/coco/ --max_iter=10 --enable_ce --pretrained_model=./imagenet_resnet50_fusebn | python _ce.py
+
diff --git a/fluid/PaddleCV/faster_rcnn/__init__.py b/fluid/PaddleCV/faster_rcnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fluid/PaddleCV/faster_rcnn/_ce.py b/fluid/PaddleCV/faster_rcnn/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d5850fd22c3d023eb866fa474b6f6f586ca326e
--- /dev/null
+++ b/fluid/PaddleCV/faster_rcnn/_ce.py
@@ -0,0 +1,61 @@
+# this file is only used for continuous evaluation test!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.08, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.08, 0)
+
+tracking_kpis = [
+        each_pass_duration_card1_kpi,
+        train_loss_card1_kpi,
+        each_pass_duration_card4_kpi,
+        train_loss_card4_kpi,
+        ]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    The suggestion:
+
+    each line in the log should be key, value, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/PaddleCV/faster_rcnn/train.py b/fluid/PaddleCV/faster_rcnn/train.py
index 1b18f85d8b18c74809bac5a8c7c2b4b0d5e0e232..b840d2855c09e1df91601d30df1503a6003aeef5 100644
--- a/fluid/PaddleCV/faster_rcnn/train.py
+++ b/fluid/PaddleCV/faster_rcnn/train.py
@@ -35,7 +35,7 @@ def train():
     learning_rate = cfg.learning_rate
     image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]
 
-    if cfg.debug:
+    if cfg.debug or cfg.enable_ce:
         fluid.default_startup_program().random_seed = 1000
         fluid.default_main_program().random_seed = 1000
         import random
@@ -46,11 +46,14 @@ def train():
     devices_num = len(devices.split(","))
     total_batch_size = devices_num * cfg.TRAIN.im_per_batch
 
+    use_random = True
+    if cfg.enable_ce:
+        use_random = False
     model = model_builder.FasterRCNN(
         add_conv_body_func=resnet.add_ResNet50_conv4_body,
         add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
         use_pyreader=cfg.use_pyreader,
-        use_random=True)
+        use_random=use_random)
     model.build_model(image_shape)
     loss_cls, loss_bbox, rpn_cls_loss, rpn_reg_loss = model.loss()
     loss_cls.persistable = True
@@ -92,16 +95,19 @@ def train():
         train_exe = fluid.ParallelExecutor(
             use_cuda=bool(cfg.use_gpu), loss_name=loss.name)
 
+    shuffle = True
+    if cfg.enable_ce:
+        shuffle = False
     if cfg.use_pyreader:
         train_reader = reader.train(
             batch_size=cfg.TRAIN.im_per_batch,
             total_batch_size=total_batch_size,
             padding_total=cfg.TRAIN.padding_minibatch,
-            shuffle=True)
+            shuffle=shuffle)
         py_reader = model.py_reader
         py_reader.decorate_paddle_reader(train_reader)
     else:
-        train_reader = reader.train(batch_size=total_batch_size, shuffle=True)
+        train_reader = reader.train(batch_size=total_batch_size, shuffle=shuffle)
         feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())
 
     def save_model(postfix):
@@ -118,6 +124,8 @@ def train():
         try:
             start_time = time.time()
             prev_start_time = start_time
+            total_time = 0
+            last_loss = 0
             every_pass_loss = []
             for iter_id in range(cfg.max_iter):
                 prev_start_time = start_time
@@ -131,9 +139,23 @@ def train():
                     iter_id, lr[0],
                     smoothed_loss.get_median_value(
                     ), start_time - prev_start_time))
+                end_time = time.time()
+                total_time += end_time - start_time
+                last_loss = np.mean(np.array(losses[0]))
+
                 sys.stdout.flush()
                 if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                     save_model("model_iter{}".format(iter_id))
+            # only for ce
+            if cfg.enable_ce:
+                gpu_num = devices_num
+                epoch_idx = iter_id + 1
+                loss = last_loss
+                print("kpis\teach_pass_duration_card%s\t%s" %
+                        (gpu_num, total_time / epoch_idx))
+                print("kpis\ttrain_loss_card%s\t%s" %
+                        (gpu_num, loss))
+
         except fluid.core.EOFException:
             py_reader.reset()
         return np.mean(every_pass_loss)
@@ -142,6 +164,8 @@ def train():
         start_time = time.time()
         prev_start_time = start_time
         start = start_time
+        total_time = 0
+        last_loss = 0
         every_pass_loss = []
         smoothed_loss = SmoothedValue(cfg.log_window)
         for iter_id, data in enumerate(train_reader()):
@@ -154,6 +178,9 @@ def train():
             smoothed_loss.add_value(loss_v)
             lr = np.array(fluid.global_scope().find_var('learning_rate')
                           .get_tensor())
+            end_time = time.time()
+            total_time += end_time - start_time
+            last_loss = loss_v
             print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format(
                 iter_id, lr[0],
                 smoothed_loss.get_median_value(), start_time - prev_start_time))
@@ -162,6 +189,16 @@ def train():
                 save_model("model_iter{}".format(iter_id))
             if (iter_id + 1) == cfg.max_iter:
                 break
+        # only for ce
+        if cfg.enable_ce:
+            gpu_num = devices_num
+            epoch_idx = iter_id + 1
+            loss = last_loss
+            print("kpis\teach_pass_duration_card%s\t%s" %
+                    (gpu_num, total_time / epoch_idx))
+            print("kpis\ttrain_loss_card%s\t%s" %
+                    (gpu_num, loss))
+
         return np.mean(every_pass_loss)
 
     if cfg.use_pyreader:
diff --git a/fluid/PaddleCV/faster_rcnn/utility.py b/fluid/PaddleCV/faster_rcnn/utility.py
index 12a208823482a6904e4f0ee0dcae84fa38f7cf37..3687cc76bc21e00ccfdfbaa52270412df4586f9c 100644
--- a/fluid/PaddleCV/faster_rcnn/utility.py
+++ b/fluid/PaddleCV/faster_rcnn/utility.py
@@ -129,6 +129,9 @@ def parse_args():
     add_arg('draw_threshold',  float, 0.8,    "Confidence threshold to draw bbox.")
     add_arg('image_path',       str,   'data/COCO17/val2017',  "The image path used to inference and visualize.")
     add_arg('image_name',        str,    '',       "The single image used to inference and visualize.")
+    # ce
+    parser.add_argument(
+            '--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
     # yapf: enable
     args = parser.parse_args()
     file_name = sys.argv[0]
diff --git a/fluid/PaddleCV/gan/c_gan/c_gan.py b/fluid/PaddleCV/gan/c_gan/c_gan.py
index 18c6e5df232d5077126001b0fe17ca098c8e6c4b..ebf5f87fda33375e045022aca860e81b752ffaf5 100644
--- a/fluid/PaddleCV/gan/c_gan/c_gan.py
+++ b/fluid/PaddleCV/gan/c_gan/c_gan.py
@@ -165,7 +165,8 @@ def train(args):
                           'conditions': conditions_data},
                     fetch_list={dg_loss})[0][0]
                 losses[1].append(dg_loss_n)
-            t_time += (time.time() - s_time)
+            batch_time = time.time() - s_time
+            t_time += batch_time
 
             
 
@@ -180,8 +181,9 @@ def train(args):
                     fetch_list={g_img})[0]
                 total_images = np.concatenate([real_image, generated_images])
                 fig = plot(total_images)
-                msg = "Epoch ID={0}\n Batch ID={1}\n D-Loss={2}\n DG-Loss={3}\n gen={4}".format(
-                    pass_id, batch_id, d_loss_n, dg_loss_n, check(generated_images))
+                msg = "Epoch ID={0}\n Batch ID={1}\n D-Loss={2}\n DG-Loss={3}\n gen={4}\n " \
+                      "Batch_time_cost={5:.2f}".format(
+                    pass_id, batch_id, d_loss_n, dg_loss_n, check(generated_images), batch_time)
                 print(msg)
                 plt.title(msg)
                 plt.savefig(
diff --git a/fluid/PaddleCV/gan/cycle_gan/train.py b/fluid/PaddleCV/gan/cycle_gan/train.py
index 1cc2fa090b3c35d61071f7ce1b7caedbd18226f9..ea7887570f8e4063bb036d67ecf58c45902fd3f2 100644
--- a/fluid/PaddleCV/gan/cycle_gan/train.py
+++ b/fluid/PaddleCV/gan/cycle_gan/train.py
@@ -187,10 +187,12 @@ def train(args):
                 fetch_list=[d_A_trainer.d_loss_A],
                 feed={"input_A": tensor_A,
                       "fake_pool_A": fake_pool_A})[0]
-            t_time += (time.time() - s_time)
-            print("epoch{}; batch{}; g_A_loss: {}; d_B_loss: {}; g_B_loss: {}; d_A_loss: {};".format(
+            batch_time = time.time() - s_time
+            t_time += batch_time
+            print("epoch{}; batch{}; g_A_loss: {}; d_B_loss: {}; g_B_loss: {}; d_A_loss: {}; "
+                  "Batch_time_cost: {:.2f}".format(
                 epoch, batch_id, g_A_loss[0], d_B_loss[0], g_B_loss[0],
-                d_A_loss[0]))
+                d_A_loss[0], batch_time))
             losses[0].append(g_A_loss[0])
             losses[1].append(d_A_loss[0])
             sys.stdout.flush()
diff --git a/fluid/PaddleCV/image_classification/.run_ce.sh b/fluid/PaddleCV/image_classification/.run_ce.sh
index 9ba9a4c2c6779694f0e87e12ca85b59afa33f1c0..cc0d894a634bc0add12fd83840990eacf77382cc 100755
--- a/fluid/PaddleCV/image_classification/.run_ce.sh
+++ b/fluid/PaddleCV/image_classification/.run_ce.sh
@@ -7,6 +7,7 @@ cudaid=${object_detection_cudaid:=0}
 export CUDA_VISIBLE_DEVICES=$cudaid
 python train.py --batch_size=${BATCH_SIZE} --num_epochs=5 --enable_ce=True --lr_strategy=cosine_decay | python _ce.py
 
+BATCH_SIZE=224
 cudaid=${object_detection_cudaid_m:=0, 1, 2, 3}
 export CUDA_VISIBLE_DEVICES=$cudaid
 python train.py --batch_size=${BATCH_SIZE} --num_epochs=5 --enable_ce=True --lr_strategy=cosine_decay | python _ce.py
diff --git a/fluid/PaddleCV/image_classification/train.py b/fluid/PaddleCV/image_classification/train.py
index ba3c94d129965684e17f692faf653ffa15984371..28652286fbdef1c1d572c6150868042be30a2138 100644
--- a/fluid/PaddleCV/image_classification/train.py
+++ b/fluid/PaddleCV/image_classification/train.py
@@ -242,7 +242,7 @@ def train(args):
         device_num = subprocess.check_output(['nvidia-smi', '-L']).decode().count('\n')
 
     train_batch_size = args.batch_size / device_num
-    test_batch_size = 8
+    test_batch_size = 16
     if not args.enable_ce:
         train_reader = paddle.batch(
             reader.train(), batch_size=train_batch_size, drop_last=True)
@@ -306,7 +306,7 @@ def train(args):
         train_loss = np.array(train_info[0]).mean()
         train_acc1 = np.array(train_info[1]).mean()
         train_acc5 = np.array(train_info[2]).mean()
-        train_speed = np.array(train_time).mean() / train_batch_size
+        train_speed = np.array(train_time).mean() / (train_batch_size * device_num)
 
         test_py_reader.start()
 
diff --git a/fluid/PaddleCV/object_detection/data_util.py b/fluid/PaddleCV/object_detection/data_util.py
index ac022593119e0008c3f7f3858303cbf5bc717650..e7d6b2b43eee5048fb5d3d8397a3e88aa0f14b49 100644
--- a/fluid/PaddleCV/object_detection/data_util.py
+++ b/fluid/PaddleCV/object_detection/data_util.py
@@ -68,6 +68,7 @@ class GeneratorEnqueuer(object):
                         try:
                             task()
                         except Exception:
+                            traceback.print_exc()
                             self._stop_event.set()
                             break
             else:
@@ -75,6 +76,7 @@ class GeneratorEnqueuer(object):
                     try:
                         task()
                     except Exception:
+                        traceback.print_exc()
                         self._stop_event.set()
                         break
 
diff --git a/fluid/PaddleCV/object_detection/reader.py b/fluid/PaddleCV/object_detection/reader.py
index 59da1b38fb2e9cce8bb99a2773e7fc222ee33bd8..6acc18594e5979308a7ba641002569b0867516a8 100644
--- a/fluid/PaddleCV/object_detection/reader.py
+++ b/fluid/PaddleCV/object_detection/reader.py
@@ -176,10 +176,17 @@ def coco(settings, file_list, mode, batch_size, shuffle):
         if mode == 'train' and shuffle:
             np.random.shuffle(images)
         batch_out = []
+        if '2014' in file_list:
+            sub_dir = "train2014" if model == "train" else "val2014"
+        elif '2017' in file_list:
+            sub_dir = "train2017" if mode == "train" else "val2017"
+        data_dir = os.path.join(settings.data_dir, sub_dir)
         for image in images:
             image_name = image['file_name']
-            image_path = os.path.join(settings.data_dir, image_name)
-
+            image_path = os.path.join(data_dir, image_name)
+            if not os.path.exists(image_path):
+                raise ValueError("%s is not exist, you should specify "
+                                 "data path correctly." % image_path)
             im = Image.open(image_path)
             if im.mode == 'L':
                 im = im.convert('RGB')
@@ -242,7 +249,9 @@ def pascalvoc(settings, file_list, mode, batch_size, shuffle):
             image_path, label_path = image.split()
             image_path = os.path.join(settings.data_dir, image_path)
             label_path = os.path.join(settings.data_dir, label_path)
-
+            if not os.path.exists(image_path):
+                raise ValueError("%s is not exist, you should specify "
+                                 "data path correctly." % image_path)
             im = Image.open(image_path)
             if im.mode == 'L':
                 im = im.convert('RGB')
@@ -295,7 +304,6 @@ def train(settings,
           max_queue=24,
           enable_ce=False):
     file_list = os.path.join(settings.data_dir, file_list)
-
     if 'coco' in settings.dataset:
         generator = coco(settings, file_list, "train", batch_size, shuffle)
     else:
@@ -341,6 +349,9 @@ def test(settings, file_list, batch_size):
 
 def infer(settings, image_path):
     def reader():
+        if not os.path.exists(image_path):
+            raise ValueError("%s is not exist, you should specify "
+                             "data path correctly." % image_path)
         img = Image.open(image_path)
         if img.mode == 'L':
             img = im.convert('RGB')
diff --git a/fluid/PaddleNLP/deep_attention_matching_net/_ce.py b/fluid/PaddleNLP/deep_attention_matching_net/_ce.py
index 0c38c0a3d1b0fc0a240a7bae928d9c07f8b95886..7ad30288074da3124c33fad6c96fd369a812c77c 100644
--- a/fluid/PaddleNLP/deep_attention_matching_net/_ce.py
+++ b/fluid/PaddleNLP/deep_attention_matching_net/_ce.py
@@ -7,8 +7,8 @@ from kpi import CostKpi, DurationKpi, AccKpi
 
 #### NOTE kpi.py should shared in models in some way!!!!
 
-train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
-train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.05, 0, actived=True)
 
 tracking_kpis = [
     train_cost_kpi,
diff --git a/fluid/PaddleNLP/deep_attention_matching_net/train_and_evaluate.py b/fluid/PaddleNLP/deep_attention_matching_net/train_and_evaluate.py
index f240615b59376e8d86ce2ebaddd8eae8ee15fe30..725c63b8e8eeeef6b6fdb76039c47cae7729f80d 100644
--- a/fluid/PaddleNLP/deep_attention_matching_net/train_and_evaluate.py
+++ b/fluid/PaddleNLP/deep_attention_matching_net/train_and_evaluate.py
@@ -390,6 +390,8 @@ def train(args):
         else:
             global_step, last_cost = train_with_feed(global_step)
         train_time += time.time() - begin_time
+        print("Pass {0}, pass_time_cost {1}"
+          .format(epoch, "%2.2f sec" % (time.time() - begin_time)))
     # For internal continuous evaluation
     if "CE_MODE_X" in os.environ:
         print("kpis	train_cost	%f" % last_cost)
diff --git a/fluid/PaddleNLP/machine_reading_comprehension/_ce.py b/fluid/PaddleNLP/machine_reading_comprehension/_ce.py
index cff13c8722007987a3cd82f1298206248963e45a..a425fe951fb587749f31b18959917cdeed76a41d 100644
--- a/fluid/PaddleNLP/machine_reading_comprehension/_ce.py
+++ b/fluid/PaddleNLP/machine_reading_comprehension/_ce.py
@@ -3,6 +3,7 @@
 import os
 import sys
 #sys.path.insert(0, os.environ['ceroot'])
+sys.path.append(os.environ['ceroot'])
 from kpi import CostKpi, DurationKpi, AccKpi
 
 #### NOTE kpi.py should shared in models in some way!!!!
diff --git a/fluid/PaddleNLP/machine_reading_comprehension/run.py b/fluid/PaddleNLP/machine_reading_comprehension/run.py
index 74561297f003faa4b3d871c0f327b65da63e81e7..884549d106af7f44789728fb488b5e60e149e118 100644
--- a/fluid/PaddleNLP/machine_reading_comprehension/run.py
+++ b/fluid/PaddleNLP/machine_reading_comprehension/run.py
@@ -446,7 +446,9 @@ def train(logger, args):
                             logger.info('Dev eval result: {}'.format(
                                 bleu_rouge))
                 pass_end_time = time.time()
-
+                time_consumed = pass_end_time - pass_start_time
+                logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format(
+                    pass_id, time_consumed))
                 logger.info('Evaluating the model after epoch {}'.format(
                     pass_id))
                 if brc_data.dev_set is not None:
@@ -459,7 +461,7 @@ def train(logger, args):
                 else:
                     logger.warning(
                         'No dev set is loaded for evaluation in the dataset!')
-                time_consumed = pass_end_time - pass_start_time
+
                 logger.info('Average train loss for epoch {} is {}'.format(
                     pass_id, "%.10f" % (1.0 * total_loss / total_num)))
 
diff --git a/fluid/PaddleNLP/text_classification/train.py b/fluid/PaddleNLP/text_classification/train.py
index 159266f3956b950afa200e9f53c9fdc6c36309aa..174636f06ec5fe07180347745f910166140e9eed 100644
--- a/fluid/PaddleNLP/text_classification/train.py
+++ b/fluid/PaddleNLP/text_classification/train.py
@@ -89,7 +89,7 @@ def train(train_reader,
 
 def train_net():
     word_dict, train_reader, test_reader = utils.prepare_data(
-        "imdb", self_dict=False, batch_size=4, buf_size=50000)
+        "imdb", self_dict=False, batch_size=128, buf_size=50000)
 
     if sys.argv[1] == "bow":
         train(
diff --git a/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh b/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
old mode 100644
new mode 100755
index eca247a40a3f680a6a59c4a183bfba006ced8d44..f1bb7febd3f2c572544612baf24be14c711108e3
--- a/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
+++ b/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
@@ -6,9 +6,9 @@ export OMP_NUM_THREADS=1
 cudaid=${text_matching_on_quora:=0} # use 0-th card as default
 export CUDA_VISIBLE_DEVICES=$cudaid
 
-FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
+FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
 
 cudaid=${text_matching_on_quora_m:=0,1,2,3} # use 0,1,2,3 card as default
 export CUDA_VISIBLE_DEVICES=$cudaid
 
-FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
+FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
diff --git a/fluid/PaddleNLP/text_matching_on_quora/_ce.py b/fluid/PaddleNLP/text_matching_on_quora/_ce.py
index b38ad21a1e0eb7407f78d100a3cb3659f6c5d8d3..eadeb821da6f7049d1916a65a1ae4eb995c5cb6d 100644
--- a/fluid/PaddleNLP/text_matching_on_quora/_ce.py
+++ b/fluid/PaddleNLP/text_matching_on_quora/_ce.py
@@ -7,11 +7,11 @@ from kpi import CostKpi
 from kpi import DurationKpi
 
 
-each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.05, 0, actived=True)
-train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.2, 0)
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
+train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.08, 0)
 train_avg_acc_card1_kpi = CostKpi('train_avg_acc_card1', 0.02, 0)
-each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.05, 0, actived=True)
-train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.2, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
+train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.08, 0)
 train_avg_acc_card4_kpi = CostKpi('train_avg_acc_card4', 0.02, 0)
 
 tracking_kpis = [
diff --git a/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py b/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
index 714fa6f970d9f213efdc6b6e1799b244696fb20d..0f88c6b6ef13aec25e08527b7efabe8638a3af25 100755
--- a/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
+++ b/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
@@ -34,6 +34,7 @@ parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument('--model_name',       type=str,   default='cdssmNet',                  help="Which model to train")
 parser.add_argument('--config',           type=str,   default='cdssm_base',       help="The global config setting")
 parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
+parser.add_argument('--epoch_num', type=int, help='Number of epoch')
 
 DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')
 
@@ -241,6 +242,9 @@ def main():
     args = parser.parse_args()
     global_config = configs.__dict__[args.config]()
 
+    if args.epoch_num != None:
+        global_config.epoch_num = args.epoch_num
+
     print("net_name: ", args.model_name)
     net = models.__dict__[args.model_name](global_config)
 
diff --git a/fluid/PaddleRec/multiview_simnet/nets.py b/fluid/PaddleRec/multiview_simnet/nets.py
index 41e366f55c80c5151102ed5e81a2746774fb3b4b..fed177844bdd247d163aee9e8625cd0ec74378b3 100644
--- a/fluid/PaddleRec/multiview_simnet/nets.py
+++ b/fluid/PaddleRec/multiview_simnet/nets.py
@@ -33,7 +33,7 @@ class CNNEncoder(object):
     """ cnn-encoder"""
 
     def __init__(self,
-                 param_name="cnn.w",
+                 param_name="cnn",
                  win_size=3,
                  ksize=128,
                  act='tanh',
@@ -51,13 +51,15 @@ class CNNEncoder(object):
             filter_size=self.win_size,
             act=self.act,
             pool_type=self.pool_type,
-            param_attr=str(self.param_name))
+            param_attr=self.param_name + ".param",
+            bias_attr=self.param_name + ".bias")
+        
 
 
 class GrnnEncoder(object):
     """ grnn-encoder """
 
-    def __init__(self, param_name="grnn.w", hidden_size=128):
+    def __init__(self, param_name="grnn", hidden_size=128):
         self.param_name = param_name
         self.hidden_size = hidden_size
 
@@ -65,13 +67,15 @@ class GrnnEncoder(object):
         fc0 = nn.fc(
             input=emb, 
             size=self.hidden_size * 3, 
-            param_attr=str(str(self.param_name) + "_fc")
-        )
+            param_attr=self.param_name + "_fc.w",
+            bias_attr=False)
+        
         gru_h = nn.dynamic_gru(
             input=fc0,
             size=self.hidden_size,
             is_reverse=False,
-            param_attr=str(self.param_name))
+            param_attr=self.param_name + ".param",
+            bias_attr=self.param_name + ".bias")
         return nn.sequence_pool(input=gru_h, pool_type='max')
 
 
@@ -139,17 +143,17 @@ class MultiviewSimnet(object):
         # lookup embedding for each slot
         q_embs = [
             nn.embedding(
-                input=query, size=self.emb_shape, param_attr="emb.w")
+                input=query, size=self.emb_shape, param_attr="emb")
             for query in q_slots
         ]
         pt_embs = [
             nn.embedding(
-                input=title, size=self.emb_shape, param_attr="emb.w")
+                input=title, size=self.emb_shape, param_attr="emb")
             for title in pt_slots
         ]
         nt_embs = [
             nn.embedding(
-                input=title, size=self.emb_shape, param_attr="emb.w")
+                input=title, size=self.emb_shape, param_attr="emb")
             for title in nt_slots
         ]
 
@@ -170,9 +174,9 @@ class MultiviewSimnet(object):
         nt_concat = nn.concat(nt_encodes)
 
         # projection of hidden layer
-        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
-        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
-        nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w')
+        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b')
+        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
+        nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
 
         # cosine of hidden layers
         cos_pos = nn.cos_sim(q_hid, pt_hid)
@@ -213,12 +217,12 @@ class MultiviewSimnet(object):
         # lookup embedding for each slot
         q_embs = [
             nn.embedding(
-                input=query, size=self.emb_shape, param_attr="emb.w")
+                input=query, size=self.emb_shape, param_attr="emb")
             for query in q_slots
         ]
         pt_embs = [
             nn.embedding(
-                input=title, size=self.emb_shape, param_attr="emb.w")
+                input=title, size=self.emb_shape, param_attr="emb")
             for title in pt_slots
         ]
         # encode each embedding field with encoder
@@ -232,8 +236,8 @@ class MultiviewSimnet(object):
         q_concat = nn.concat(q_encodes)
         pt_concat = nn.concat(pt_encodes)
         # projection of hidden layer
-        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
-        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
+        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b')
+        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
         # cosine of hidden layers
         cos = nn.cos_sim(q_hid, pt_hid)
         return cos
diff --git a/fluid/PaddleRec/word2vec/README.cn.md b/fluid/PaddleRec/word2vec/README.cn.md
index 076b3eefdc78a4d7423a80bd1f812dd55c4f085d..7ed9ddc308892f0cdf25641436c25e245035d31b 100644
--- a/fluid/PaddleRec/word2vec/README.cn.md
+++ b/fluid/PaddleRec/word2vec/README.cn.md
@@ -25,6 +25,7 @@ cd data && ./download.sh && cd ..
 ```bash
 python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
 ```
+如果您想使用我们支持的第三方词汇表，请将--other_dict_path设置为您存放将使用的词汇表的目录，并设置--with_other_dict使用它
 
 ## 训练
 训练的命令行选项可以通过`python train.py -h`列出。
diff --git a/fluid/PaddleRec/word2vec/README.md b/fluid/PaddleRec/word2vec/README.md
index 9ed321e62c96cc1dc1725d45bbc228732e339c04..3534fa712ab434d7852d72ba75b0bdcff96b3ca1 100644
--- a/fluid/PaddleRec/word2vec/README.md
+++ b/fluid/PaddleRec/word2vec/README.md
@@ -14,6 +14,11 @@ Download dataset:
 ```bash
 cd data && ./download.sh && cd ..
 ```
+if you would like to use our supported third party vocab, please run:
+
+```bash
+wget http://download.tensorflow.org/models/LM_LSTM_CNN/vocab-2016-09-10.txt
+```
 
 ## Model
 This model implement a skip-gram model of word2vector.
@@ -24,8 +29,10 @@ This model implement a skip-gram model of word2vector.
 Preprocess the training data to generate a word dict.
 
 ```bash
-python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --dict_path data/1-billion_dict
+python preprocess.py --data_path ./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled --is_local --dict_path data/1-billion_dict
 ```
+if you would like to use our supported third party vocab, please set --other_dict_path as the directory of where you
+save the vocab you will use and set --with_other_dict flag on to using it.
 
 ## Train
 The command line options for training can be listed by `python train.py -h`.
diff --git a/fluid/PaddleRec/word2vec/infer.py b/fluid/PaddleRec/word2vec/infer.py
index 2e0c8ea3625a1965872f28d2f936f7c46657aa82..c0dd82ef7f0c3a0c6012e80ea2e85a8b80d44a75 100644
--- a/fluid/PaddleRec/word2vec/infer.py
+++ b/fluid/PaddleRec/word2vec/infer.py
@@ -2,11 +2,9 @@ import time
 import os
 import paddle.fluid as fluid
 import numpy as np
-from Queue import PriorityQueue
 import logging
 import argparse
 import preprocess
-from sklearn.metrics.pairwise import cosine_similarity
 
 word_to_id = dict()
 id_to_word = dict()
@@ -51,8 +49,8 @@ def parse_args():
         '--test_acc',
         action='store_true',
         required=False,
-        default=True,
-        help='if using test_files , (default: True)')
+        default=False,
+        help='if using test_files , (default: False)')
     parser.add_argument(
         '--test_files_dir',
         type=str,
@@ -85,14 +83,12 @@ def build_test_case_from_file(args, emb):
     logger.info("test files list: {}".format(current_list))
     test_cases = list()
     test_labels = list()
+    test_case_descs = list()
     exclude_lists = list()
     for file_dir in current_list:
         with open(args.test_files_dir + "/" + file_dir, 'r') as f:
-            count = 0
             for line in f:
-                if count == 0:
-                    pass
-                elif ':' in line:
+                if ':' in line:
                     logger.info("{}".format(line))
                     pass
                 else:
@@ -102,14 +98,15 @@ def build_test_case_from_file(args, emb):
                             line.split()[2]]]
                     test_case_desc = line.split()[0] + " - " + line.split()[
                         1] + " + " + line.split()[2] + " = " + line.split()[3]
-                    test_cases.append([test_case, test_case_desc])
+                    test_cases.append(test_case)
+                    test_case_descs.append(test_case_desc)
                     test_labels.append(word_to_id[line.split()[3]])
                     exclude_lists.append([
                         word_to_id[line.split()[0]],
                         word_to_id[line.split()[1]], word_to_id[line.split()[2]]
                     ])
-                count += 1
-    return test_cases, test_labels, exclude_lists
+            test_cases = norm(np.array(test_cases))
+    return test_cases, test_case_descs, test_labels, exclude_lists
 
 
 def build_small_test_case(emb):
@@ -133,8 +130,27 @@ def build_small_test_case(emb):
         'deeper']]
     desc5 = "old - older + deeper = deep"
     label5 = word_to_id["deep"]
-    return [[emb1, desc1], [emb2, desc2], [emb3, desc3], [emb4, desc4],
-            [emb5, desc5]], [label1, label2, label3, label4, label5]
+
+    emb6 = emb[word_to_id['boy']]
+    desc6 = "boy"
+    label6 = word_to_id["boy"]
+    emb7 = emb[word_to_id['king']]
+    desc7 = "king"
+    label7 = word_to_id["king"]
+    emb8 = emb[word_to_id['sun']]
+    desc8 = "sun"
+    label8 = word_to_id["sun"]
+    emb9 = emb[word_to_id['key']]
+    desc9 = "key"
+    label9 = word_to_id["key"]
+    test_cases = [emb1, emb2, emb3, emb4, emb5, emb6, emb7, emb8, emb9]
+    test_case_desc = [
+        desc1, desc2, desc3, desc4, desc5, desc6, desc7, desc8, desc9
+    ]
+    test_labels = [
+        label1, label2, label3, label4, label5, label6, label7, label8, label9
+    ]
+    return norm(np.array(test_cases)), test_case_desc, test_labels
 
 
 def build_test_case(args, emb):
@@ -144,86 +160,80 @@ def build_test_case(args, emb):
         return build_small_test_case(emb)
 
 
+def norm(x):
+    y = np.linalg.norm(x, axis=1, keepdims=True)
+    return x / y
+
+
 def inference_test(scope, model_dir, args):
     BuildWord_IdMap(args.dict_path)
     logger.info("model_dir is: {}".format(model_dir + "/"))
     emb = np.array(scope.find_var("embeding").get_tensor())
+    x = norm(emb)
     logger.info("inference result: ====================")
-    test_cases = list()
+    test_cases = None
+    test_case_desc = list()
     test_labels = list()
     exclude_lists = list()
     if args.test_acc:
-        test_cases, test_labels, exclude_lists = build_test_case(args, emb)
+        test_cases, test_case_desc, test_labels, exclude_lists = build_test_case(
+            args, emb)
     else:
-        test_cases, test_labels = build_test_case(args, emb)
+        test_cases, test_case_desc, test_labels = build_test_case(args, emb)
         exclude_lists = [[-1]]
     accual_rank = 1 if args.test_acc else args.rank_num
     correct_num = 0
+    cosine_similarity_matrix = np.dot(test_cases, x.T)
+    results = topKs(accual_rank, cosine_similarity_matrix, exclude_lists,
+                    args.test_acc)
     for i in range(len(test_labels)):
-        pq = None
-        if args.test_acc:
-            pq = topK(
-                accual_rank,
-                emb,
-                test_cases[i][0],
-                exclude_lists[i],
-                is_acc=True)
-        else:
-            pq = pq = topK(
-                accual_rank,
-                emb,
-                test_cases[i][0],
-                exclude_lists[0],
-                is_acc=False)
-        logger.info("Test result for {}".format(test_cases[i][1]))
+        logger.info("Test result for {}".format(test_case_desc[i]))
+        result = results[i]
         for j in range(accual_rank):
-            pq_tmps = pq.get()
-            if (j == accual_rank - 1) and (
-                    pq_tmps.id == test_labels[i]
-            ):  # if the nearest word is what we want 
+            if result[j][1] == test_labels[
+                    i]:  # if the nearest word is what we want 
                 correct_num += 1
-            logger.info("{} nearest is {}, rate is {}".format(
-                accual_rank - j, id_to_word[pq_tmps.id], pq_tmps.priority))
-    acc = correct_num / len(test_labels)
-    logger.info("Test acc is: {}, there are {} / {}}".format(acc, correct_num,
-                                                             len(test_labels)))
-
+            logger.info("{} nearest is {}, rate is {}".format(j, id_to_word[
+                result[j][1]], result[j][0]))
+    logger.info("Test acc is: {}, there are {} / {}".format(correct_num / len(
+        test_labels), correct_num, len(test_labels)))
 
-class PQ_Entry(object):
-    def __init__(self, cos_similarity, id):
-        self.priority = cos_similarity
-        self.id = id
-
-    def __cmp__(self, other):
-        return cmp(self.priority, other.priority)
 
+def topK(k, cosine_similarity_list, exclude_list, is_acc=False):
+    if k == 1 and is_acc:  # accelerate acc calculate
+        max = cosine_similarity_list[0]
+        id = 0
+        for i in range(len(cosine_similarity_list)):
+            if cosine_similarity_list[i] >= max and (i not in exclude_list):
+                max = cosine_similarity_list[i]
+                id = i
+            else:
+                pass
+        return [[max, id]]
+    else:
+        result = list()
+        result_index = np.argpartition(cosine_similarity_list, -k)[-k:]
+        for index in result_index:
+            result.append([cosine_similarity_list[index], index])
+        result.sort(reverse=True)
+        return result
 
-def topK(k, emb, test_emb, exclude_list, is_acc=False):
-    pq = PriorityQueue(k + 1)
-    while not pq.empty():
-        try:
-            pq.get(False)
-        except Empty:
-            continue
-        pq.task_done()
 
-    if len(emb) <= k:
-        for i in range(len(emb)):
-            x = cosine_similarity([emb[i]], [test_emb])
-            pq.put(PQ_Entry(x, i))
-        return pq
+def topKs(k, cosine_similarity_matrix, exclude_lists, is_acc=False):
+    results = list()
+    result_queues = list()
+    correct_num = 0
 
-    for i in range(len(emb)):
-        if is_acc and (i in exclude_list):
-            pass
+    for i in range(cosine_similarity_matrix.shape[0]):
+        tmp_pq = None
+        if is_acc:
+            tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[i],
+                          is_acc)
         else:
-            x = cosine_similarity([emb[i]], [test_emb])
-            pq_e = PQ_Entry(x, i)
-            if pq.full():
-                pq.get()
-            pq.put(pq_e)
-    pq.get()
-    return pq
+            tmp_pq = topK(k, cosine_similarity_matrix[i], exclude_lists[0],
+                          is_acc)
+        result_queues.append(tmp_pq)
+    return result_queues
 
 
 def infer_during_train(args):
@@ -235,8 +245,6 @@ def infer_during_train(args):
     while True:
         time.sleep(60)
         current_list = os.listdir(args.model_output_dir)
-        # logger.info("current_list is : {}".format(current_list))
-        # logger.info("model_file_list is : {}".format(model_file_list))
         if set(model_file_list) == set(current_list):
             if solved_new:
                 solved_new = False
@@ -271,6 +279,8 @@ def infer_once(args):
             fluid.io.load_persistables(
                 executor=exe, dirname=args.model_output_dir + "/")
             inference_test(Scope, args.model_output_dir, args)
+    else:
+        logger.info("Wrong Directory or save model failed!")
 
 
 if __name__ == '__main__':
diff --git a/fluid/PaddleRec/word2vec/network_conf.py b/fluid/PaddleRec/word2vec/network_conf.py
index 5b8e95136a177496a5569d7377d6a2b7f5d30714..16178c339bbcc42c1e5f1e78089a0d0942810444 100644
--- a/fluid/PaddleRec/word2vec/network_conf.py
+++ b/fluid/PaddleRec/word2vec/network_conf.py
@@ -95,8 +95,7 @@ def skip_gram_word2vec(dict_size,
         capacity=64, feed_list=datas, name='py_reader', use_double_buffer=True)
 
     words = fluid.layers.read_file(py_reader)
-
-    emb = fluid.layers.embedding(
+    target_emb = fluid.layers.embedding(
         input=words[0],
         is_sparse=is_sparse,
         size=[dict_size, embedding_size],
@@ -104,16 +103,23 @@ def skip_gram_word2vec(dict_size,
             name='embeding',
             initializer=fluid.initializer.Normal(scale=1 /
                                                  math.sqrt(dict_size))))
-
+    context_emb = fluid.layers.embedding(
+        input=words[1],
+        is_sparse=is_sparse,
+        size=[dict_size, embedding_size],
+        param_attr=fluid.ParamAttr(
+            name='embeding',
+            initializer=fluid.initializer.Normal(scale=1 /
+                                                 math.sqrt(dict_size))))
     cost, cost_nce, cost_hs = None, None, None
 
     if with_nce:
-        cost_nce = nce_layer(emb, words[1], embedding_size, dict_size, 5,
+        cost_nce = nce_layer(target_emb, words[1], embedding_size, dict_size, 5,
                              "uniform", word_frequencys, None)
         cost = cost_nce
     if with_hsigmoid:
-        cost_hs = hsigmoid_layer(emb, words[1], words[2], words[3], dict_size,
-                                 is_sparse)
+        cost_hs = hsigmoid_layer(context_emb, words[0], words[2], words[3],
+                                 dict_size, is_sparse)
         cost = cost_hs
     if with_nce and with_hsigmoid:
         cost = fluid.layers.elementwise_add(cost_nce, cost_hs)
diff --git a/fluid/PaddleRec/word2vec/preprocess.py b/fluid/PaddleRec/word2vec/preprocess.py
index d231f776461ddd23d837257a843ce20dc8839b41..0c2d4b7d29721496db6afd44692ea478964b7ee8 100644
--- a/fluid/PaddleRec/word2vec/preprocess.py
+++ b/fluid/PaddleRec/word2vec/preprocess.py
@@ -3,6 +3,7 @@
 import re
 import six
 import argparse
+import io
 
 prog = re.compile("[^a-z ]", flags=0)
 word_count = dict()
@@ -83,7 +84,6 @@ def native_to_unicode(s):
         return _to_unicode(s)
     except UnicodeDecodeError:
         res = _to_unicode(s, ignore_errors=True)
-        tf.logging.info("Ignoring Unicode error, outputting: %s" % res)
         return res
 
 
@@ -199,34 +199,30 @@ def preprocess(args):
     # word to count
 
     if args.with_other_dict:
-        with open(args.other_dict_path, 'r') as f:
+        with io.open(args.other_dict_path, 'r', encoding='utf-8') as f:
             for line in f:
                 word_count[native_to_unicode(line.strip())] = 1
 
     if args.is_local:
         for i in range(1, 100):
-            with open(args.data_path + "/news.en-000{:0>2d}-of-00100".format(
-                    i)) as f:
+            with io.open(
+                    args.data_path + "/news.en-000{:0>2d}-of-00100".format(i),
+                    encoding='utf-8') as f:
                 for line in f:
                     line = strip_lines(line)
                     words = line.split()
-                    for item in words:
-                        if item in word_count:
-                            word_count[item] = word_count[item] + 1
-                        else:
-                            word_count[native_to_unicode('<UNK>')] += 1
-    # with open(args.data_path + "/tmp.txt") as f:
-    #     for line in f:
-    #         print("line before strip is: {}".format(line))
-    #         line = strip_lines(line, word_count)
-    #         print("line after strip is: {}".format(line))
-    #         words = line.split()
-    #         print("words after split is: {}".format(words))
-    #         for item in words:
-    #             if item in word_count:
-    #                 word_count[item] = word_count[item] + 1
-    #             else:
-    #                 word_count[item] = 1
+                    if args.with_other_dict:
+                        for item in words:
+                            if item in word_count:
+                                word_count[item] = word_count[item] + 1
+                            else:
+                                word_count[native_to_unicode('<UNK>')] += 1
+                    else:
+                        for item in words:
+                            if item in word_count:
+                                word_count[item] = word_count[item] + 1
+                            else:
+                                word_count[item] = 1
     item_to_remove = []
     for item in word_count:
         if word_count[item] <= args.freq:
@@ -236,21 +232,17 @@ def preprocess(args):
 
     path_table, path_code, word_code_len = build_Huffman(word_count, 40)
 
-    with open(args.dict_path, 'w+') as f:
+    with io.open(args.dict_path, 'w+', encoding='utf-8') as f:
         for k, v in word_count.items():
-            f.write(k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
+            f.write(k + " " + str(v) + '\n')
 
-    with open(args.dict_path + "_ptable", 'w+') as f2:
+    with io.open(args.dict_path + "_ptable", 'w+', encoding='utf-8') as f2:
         for pk, pv in path_table.items():
-            f2.write(
-                pk.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
-                                                      for x in pv)) + '\n')
+            f2.write(pk + '\t' + ' '.join((str(x) for x in pv)) + '\n')
 
-    with open(args.dict_path + "_pcode", 'w+') as f3:
+    with io.open(args.dict_path + "_pcode", 'w+', encoding='utf-8') as f3:
         for pck, pcv in path_code.items():
-            f3.write(
-                pck.encode("utf-8") + "\t" + ' '.join((str(x).encode("utf-8")
-                                                       for x in pcv)) + '\n')
+            f3.write(pck + '\t' + ' '.join((str(x) for x in pcv)) + '\n')
 
 
 if __name__ == "__main__":
diff --git a/fluid/PaddleRec/word2vec/reader.py b/fluid/PaddleRec/word2vec/reader.py
index ff7d79ec4a99f3d3844f4fb2fd4b4e5edc441392..01d0d8e00488c1df79b4eaff6a7259a11cbc8a8f 100644
--- a/fluid/PaddleRec/word2vec/reader.py
+++ b/fluid/PaddleRec/word2vec/reader.py
@@ -2,14 +2,32 @@
 
 import numpy as np
 import preprocess
-
 import logging
+import io
 
 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger("fluid")
 logger.setLevel(logging.INFO)
 
 
+class NumpyRandomInt(object):
+    def __init__(self, a, b, buf_size=1000):
+        self.idx = 0
+        self.buffer = np.random.random_integers(a, b, buf_size)
+        self.a = a
+        self.b = b
+
+    def __call__(self):
+        if self.idx == len(self.buffer):
+            self.buffer = np.random.random_integers(self.a, self.b,
+                                                    len(self.buffer))
+            self.idx = 0
+
+        result = self.buffer[self.idx]
+        self.idx += 1
+        return result
+
+
 class Word2VecReader(object):
     def __init__(self,
                  dict_path,
@@ -24,6 +42,7 @@ class Word2VecReader(object):
         self.num_non_leaf = 0
         self.word_to_id_ = dict()
         self.id_to_word = dict()
+        self.word_count = dict()
         self.word_to_path = dict()
         self.word_to_code = dict()
         self.trainer_id = trainer_id
@@ -33,20 +52,19 @@ class Word2VecReader(object):
         word_counts = []
         word_id = 0
 
-        with open(dict_path, 'r') as f:
+        with io.open(dict_path, 'r', encoding='utf-8') as f:
             for line in f:
-                line = line.decode(encoding='UTF-8')
                 word, count = line.split()[0], int(line.split()[1])
+                self.word_count[word] = count
                 self.word_to_id_[word] = word_id
                 self.id_to_word[word_id] = word  #build id to word dict
                 word_id += 1
                 word_counts.append(count)
                 word_all_count += count
 
-        with open(dict_path + "_word_to_id_", 'w+') as f6:
+        with io.open(dict_path + "_word_to_id_", 'w+', encoding='utf-8') as f6:
             for k, v in self.word_to_id_.items():
-                f6.write(
-                    k.encode("utf-8") + " " + str(v).encode("utf-8") + '\n')
+                f6.write(k + " " + str(v) + '\n')
 
         self.dict_size = len(self.word_to_id_)
         self.word_frequencys = [
@@ -55,22 +73,22 @@ class Word2VecReader(object):
         print("dict_size = " + str(
             self.dict_size)) + " word_all_count = " + str(word_all_count)
 
-        with open(dict_path + "_ptable", 'r') as f2:
+        with io.open(dict_path + "_ptable", 'r', encoding='utf-8') as f2:
             for line in f2:
-                self.word_to_path[line.split("\t")[0]] = np.fromstring(
+                self.word_to_path[line.split('\t')[0]] = np.fromstring(
                     line.split('\t')[1], dtype=int, sep=' ')
                 self.num_non_leaf = np.fromstring(
                     line.split('\t')[1], dtype=int, sep=' ')[0]
         print("word_ptable dict_size = " + str(len(self.word_to_path)))
 
-        with open(dict_path + "_pcode", 'r') as f3:
+        with io.open(dict_path + "_pcode", 'r', encoding='utf-8') as f3:
             for line in f3:
-                line = line.decode(encoding='UTF-8')
-                self.word_to_code[line.split("\t")[0]] = np.fromstring(
+                self.word_to_code[line.split('\t')[0]] = np.fromstring(
                     line.split('\t')[1], dtype=int, sep=' ')
         print("word_pcode dict_size = " + str(len(self.word_to_code)))
+        self.random_generator = NumpyRandomInt(1, self.window_size_ + 1)
 
-    def get_context_words(self, words, idx, window_size):
+    def get_context_words(self, words, idx):
         """
         Get the context word list of target word.
 
@@ -78,31 +96,34 @@ class Word2VecReader(object):
         idx: input word index
         window_size: window size
         """
-        target_window = np.random.randint(1, window_size + 1)
-        # need to keep in mind that maybe there are no enough words before the target word.
-        start_point = idx - target_window if (idx - target_window) > 0 else 0
+        target_window = self.random_generator()
+        start_point = idx - target_window  # if (idx - target_window) > 0 else 0
+        if start_point < 0:
+            start_point = 0
         end_point = idx + target_window
-        # context words of the target word
-        targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
-        return list(targets)
+        targets = words[start_point:idx] + words[idx + 1:end_point + 1]
+
+        return set(targets)
 
     def train(self, with_hs):
         def _reader():
             for file in self.filelist:
-                with open(self.data_path_ + "/" + file, 'r') as f:
+                with io.open(
+                        self.data_path_ + "/" + file, 'r',
+                        encoding='utf-8') as f:
                     logger.info("running data in {}".format(self.data_path_ +
                                                             "/" + file))
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line)
+                            line = preprocess.strip_lines(line, self.word_count)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
                             ]
                             for idx, target_id in enumerate(word_ids):
                                 context_word_ids = self.get_context_words(
-                                    word_ids, idx, self.window_size_)
+                                    word_ids, idx)
                                 for context_id in context_word_ids:
                                     yield [target_id], [context_id]
                         else:
@@ -111,26 +132,28 @@ class Word2VecReader(object):
 
         def _reader_hs():
             for file in self.filelist:
-                with open(self.data_path_ + "/" + file, 'r') as f:
+                with io.open(
+                        self.data_path_ + "/" + file, 'r',
+                        encoding='utf-8') as f:
                     logger.info("running data in {}".format(self.data_path_ +
                                                             "/" + file))
                     count = 1
                     for line in f:
                         if self.trainer_id == count % self.trainer_num:
-                            line = preprocess.strip_lines(line)
+                            line = preprocess.strip_lines(line, self.word_count)
                             word_ids = [
                                 self.word_to_id_[word] for word in line.split()
                                 if word in self.word_to_id_
                             ]
                             for idx, target_id in enumerate(word_ids):
                                 context_word_ids = self.get_context_words(
-                                    word_ids, idx, self.window_size_)
+                                    word_ids, idx)
                                 for context_id in context_word_ids:
                                     yield [target_id], [context_id], [
-                                        self.word_to_code[self.id_to_word[
+                                        self.word_to_path[self.id_to_word[
                                             target_id]]
                                     ], [
-                                        self.word_to_path[self.id_to_word[
+                                        self.word_to_code[self.id_to_word[
                                             target_id]]
                                     ]
                         else:
@@ -144,13 +167,20 @@ class Word2VecReader(object):
 
 
 if __name__ == "__main__":
-    window_size = 10
+    window_size = 5
+
+    reader = Word2VecReader(
+        "./data/1-billion_dict",
+        "./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled/",
+        ["news.en-00001-of-00100"], 0, 1)
 
-    reader = Word2VecReader("data/enwik9_dict", "data/enwik9", window_size)
     i = 0
-    for x, y in reader.train()():
+    # print(reader.train(True))
+    for x, y, z, f in reader.train(True)():
         print("x: " + str(x))
         print("y: " + str(y))
+        print("path: " + str(z))
+        print("code: " + str(f))
         print("\n")
         if i == 10:
             exit(0)
diff --git a/fluid/PaddleRec/word2vec/train.py b/fluid/PaddleRec/word2vec/train.py
index 58f984303b19497c1e68207b5784f38d2f61b239..ec4be60f420bd2c41800167c6f5e72f0d7dec790 100644
--- a/fluid/PaddleRec/word2vec/train.py
+++ b/fluid/PaddleRec/word2vec/train.py
@@ -12,7 +12,7 @@ os.environ["CUDA_VISIBLE_DEVICES"] = ""
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.executor import global_scope
-
+import six
 import reader
 from network_conf import skip_gram_word2vec
 from infer import inference_test
@@ -29,7 +29,7 @@ def parse_args():
         '--train_data_path',
         type=str,
         default='./data/1-billion-word-language-modeling-benchmark-r13output/training-monolingual.tokenized.shuffled',
-        help="The path of training dataset")
+        help="The path of taining dataset")
     parser.add_argument(
         '--dict_path',
         type=str,
@@ -43,7 +43,7 @@ def parse_args():
     parser.add_argument(
         '--batch_size',
         type=int,
-        default=100,
+        default=1000,
         help="The size of mini-batch (default:100)")
     parser.add_argument(
         '--num_passes',
@@ -125,14 +125,44 @@ def parse_args():
     return parser.parse_args()
 
 
+def convert_python_to_tensor(batch_size, sample_reader, is_hs):
+    def __reader__():
+        result = None
+        if is_hs:
+            result = [[], [], [], []]
+        else:
+            result = [[], []]
+        for sample in sample_reader():
+            for i, fea in enumerate(sample):
+                result[i].append(fea)
+            if len(result[0]) == batch_size:
+                tensor_result = []
+                for tensor in result:
+                    t = fluid.Tensor()
+                    dat = np.array(tensor, dtype='int64')
+                    if len(dat.shape) > 2:
+                        dat = dat.reshape((dat.shape[0], dat.shape[2]))
+                    elif len(dat.shape) == 1:
+                        dat = dat.reshape((-1, 1))
+                    t.set(dat, fluid.CPUPlace())
+
+                    tensor_result.append(t)
+                yield tensor_result
+                if is_hs:
+                    result = [[], [], [], []]
+                else:
+                    result = [[], []]
+
+    return __reader__
+
+
 def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
-            reader.train((args.with_hs or (not args.with_nce))),
-            buf_size=args.batch_size * 100),
-        batch_size=args.batch_size)
 
-    py_reader.decorate_paddle_reader(train_reader)
+    py_reader.decorate_tensor_provider(
+        convert_python_to_tensor(args.batch_size,
+                                 reader.train((args.with_hs or (
+                                     not args.with_nce))), (args.with_hs or (
+                                         not args.with_nce))))
 
     place = fluid.CPUPlace()
 
@@ -140,6 +170,7 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     exe.run(fluid.default_startup_program())
 
     exec_strategy = fluid.ExecutionStrategy()
+    exec_strategy.use_experimental_executor = True
 
     print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
     exec_strategy.num_threads = int(os.getenv("CPU_NUM"))
@@ -161,32 +192,23 @@ def train_loop(args, train_program, reader, py_reader, loss, trainer_id):
     profiler_step_end = 30
 
     for pass_id in range(args.num_passes):
-        epoch_start = time.time()
         py_reader.start()
+        time.sleep(10)
+        epoch_start = time.time()
         batch_id = 0
         start = time.clock()
 
         try:
             while True:
 
-                if profiler_step == profiler_step_start:
-                    fluid.profiler.start_profiler(profile_state)
-
                 loss_val = train_exe.run(fetch_list=[loss.name])
                 loss_val = np.mean(loss_val)
 
-                if profiler_step == profiler_step_end:
-                    fluid.profiler.stop_profiler('total', 'trainer_profile.log')
-                    profiler_step += 1
-                else:
-                    profiler_step += 1
-
                 if batch_id % 50 == 0:
                     logger.info(
                         "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
                         format(pass_id, batch_id,
-                               loss_val.mean() / args.batch_size,
-                               py_reader.queue.size()))
+                               loss_val.mean(), py_reader.queue.size()))
                 if args.with_speed:
                     if batch_id % 1000 == 0 and batch_id != 0:
                         elapsed = (time.clock() - start)
@@ -256,7 +278,7 @@ def train(args):
 
     optimizer = None
     if args.with_Adam:
-        optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
+        optimizer = fluid.optimizer.Adam(learning_rate=1e-4, lazy_mode=True)
     else:
         optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
 
diff --git a/legacy/README.md b/legacy/README.md
index f7741c9b7b1c39e569e74606d054847b27a206d8..f0719c1a26c04341e8de327143dc826248bb3607 100644
--- a/legacy/README.md
+++ b/legacy/README.md
@@ -1,3 +1,6 @@
+
+# 该目录的模型已经不再维护，不推荐使用。建议使用Fluid目录下的模型。
+
 # Introduction to models
 
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://github.com/PaddlePaddle/models)