diff --git a/fluid/PaddleCV/deeplabv3+/.run_ce.sh b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..540fb964ba94fd29dc28bb51342cdba839d433e7
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/.run_ce.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+DATASET_PATH=${HOME}/.cache/paddle/dataset/cityscape/
+
+cudaid=${deeplabv3plus:=0} # first run: GPU ids come from $deeplabv3plus, defaulting to card 0
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output1 \
+--dataset_path=$DATASET_PATH \
+--enable_ce | python _ce.py
+
+cudaid=${deeplabv3plus_m:=0,1,2,3} # second run: GPU ids come from $deeplabv3plus_m, defaulting to cards 0,1,2,3
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true  python train.py \
+--batch_size=2 \
+--train_crop_size=769 \
+--total_step=50 \
+--save_weights_path=output4 \
+--dataset_path=$DATASET_PATH \
+--enable_ce | python _ce.py
diff --git a/fluid/PaddleCV/deeplabv3+/__init__.py b/fluid/PaddleCV/deeplabv3+/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/fluid/PaddleCV/deeplabv3+/_ce.py b/fluid/PaddleCV/deeplabv3+/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..b0127d6445213b9d3934220fa36e9eb44d3e04b4
--- /dev/null
+++ b/fluid/PaddleCV/deeplabv3+/_ce.py
@@ -0,0 +1,60 @@
+# This file is only used by the continuous evaluation (CE) tests.
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.1, 0, actived=True)
+train_loss_card1_kpi = CostKpi('train_loss_card1', 0.05, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.1, 0, actived=True)
+train_loss_card4_kpi = CostKpi('train_loss_card4', 0.05, 0)
+
+tracking_kpis = [  # log_to_ce indexes these by their .name attribute
+        each_pass_duration_card1_kpi,
+        train_loss_card1_kpi,
+        each_pass_duration_card4_kpi,
+        train_loss_card4_kpi,
+        ]
+
+
+def parse_log(log):
+    '''
+    Yield a (kpi_name, kpi_value) pair for every KPI line found in `log`.
+
+    A KPI line holds exactly three tab-separated fields: the literal tag
+    "kpis", the KPI name, and a value that float() can parse.
+
+    For example, this log yields two pairs:
+
+    "
+    kpis\ttrain_loss_card1\t1.0
+    kpis\teach_pass_duration_card1\t0.5
+    "
+
+    Each line is stripped and split on tabs, and its field list is echoed
+    to stdout; lines that are not KPI lines produce no pair.
+    '''
+    for raw_line in log.split('\n'):
+        fields = raw_line.strip().split('\t')
+        print(fields)
+        if len(fields) == 3 and fields[0] == 'kpis':
+            _, kpi_name, raw_value = fields
+            yield kpi_name, float(raw_value)
+
+
+def log_to_ce(log):
+    """Record each KPI value parsed from `log` on its tracker and persist it."""
+    kpi_tracker = {kpi.name: kpi for kpi in tracking_kpis}
+
+    for kpi_name, kpi_value in parse_log(log):
+        print(kpi_name, kpi_value)
+        tracker = kpi_tracker[kpi_name]
+        tracker.add_record(kpi_value)
+        tracker.persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()  # whole log arrives on stdin (.run_ce.sh pipes train.py into this script)
+    log_to_ce(log)
diff --git a/fluid/PaddleCV/deeplabv3+/train.py b/fluid/PaddleCV/deeplabv3+/train.py
old mode 100644
new mode 100755
index 817d53d173467f9146918ec9bb6b44141eb0ac3f..e009f76e0e16be9e4a5db532615cefac258fada1
--- a/fluid/PaddleCV/deeplabv3+/train.py
+++ b/fluid/PaddleCV/deeplabv3+/train.py
@@ -34,6 +34,10 @@ def add_arguments():
     add_argument('parallel', bool, False, "using ParallelExecutor.")
     add_argument('use_gpu', bool, True, "Whether use GPU or CPU.")
     add_argument('num_classes', int, 19, "Number of classes.")
+    parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run the task with continuous evaluation logs.')
 
 
 def load_model():
@@ -51,7 +55,10 @@ def load_model():
     else:
         if args.num_classes == 19:
             fluid.io.load_params(
-                exe, dirname=args.init_weights_path, main_program=tp)
+                exe,
+                dirname="",
+                filename=args.init_weights_path,
+                main_program=tp)
         else:
             fluid.io.load_vars(
                 exe, dirname="", filename=args.init_weights_path, vars=myvars)
@@ -84,6 +91,15 @@ def loss(logit, label):
     return loss, label_nignore
 
 
+def get_cards(args):
+    """Return the visible GPU count under CE, else ``args.num_devices``."""
+    if not args.enable_ce:
+        return args.num_devices
+    # An unset/empty CUDA_VISIBLE_DEVICES is treated as a single card.
+    cards = os.environ.get('CUDA_VISIBLE_DEVICES') or '0'
+    return len(cards.split(","))
+
+
 CityscapeDataset = reader.CityscapeDataset
 parser = argparse.ArgumentParser()
 
@@ -99,6 +115,13 @@ deeplabv3p = models.deeplabv3p
 
 sp = fluid.Program()
 tp = fluid.Program()
+
+# only for ce
+if args.enable_ce:
+    SEED = 102
+    sp.random_seed = SEED
+    tp.random_seed = SEED
+
 crop_size = args.train_crop_size
 batch_size = args.batch_size
 image_shape = [crop_size, crop_size]
@@ -155,7 +178,13 @@ if args.parallel:
 
 batches = dataset.get_batch_generator(batch_size, total_step)
 
+total_time = 0.0
+epoch_idx = 0
+train_loss = 0
+
 for i, imgs, labels, names in batches:
+    epoch_idx += 1
+    begin_time = time.time()
     prev_start_time = time.time()
     if args.parallel:
         retv = exe_p.run(fetch_list=[pred.name, loss_mean.name],
@@ -167,11 +196,21 @@ for i, imgs, labels, names in batches:
                              'label': labels},
                        fetch_list=[pred, loss_mean])
     end_time = time.time()
+    total_time += end_time - begin_time
     if i % 100 == 0:
         print("Model is saved to", args.save_weights_path)
         save_model()
     print("step {:d}, loss: {:.6f}, step_time_cost: {:.3f}".format(
         i, np.mean(retv[1]), end_time - prev_start_time))
 
+    # only for ce
+    train_loss = np.mean(retv[1])
+
+if args.enable_ce:
+    gpu_num = get_cards(args)
+    print("kpis\teach_pass_duration_card%s\t%s" %
+          (gpu_num, total_time / epoch_idx))
+    print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, train_loss))
+
 print("Training done. Model is saved to", args.save_weights_path)
 save_model()
diff --git a/fluid/PaddleCV/image_classification/.run_ce.sh b/fluid/PaddleCV/image_classification/.run_ce.sh
index 9ba9a4c2c6779694f0e87e12ca85b59afa33f1c0..cc0d894a634bc0add12fd83840990eacf77382cc 100755
--- a/fluid/PaddleCV/image_classification/.run_ce.sh
+++ b/fluid/PaddleCV/image_classification/.run_ce.sh
@@ -7,6 +7,7 @@ cudaid=${object_detection_cudaid:=0}
 export CUDA_VISIBLE_DEVICES=$cudaid
 python train.py --batch_size=${BATCH_SIZE} --num_epochs=5 --enable_ce=True --lr_strategy=cosine_decay | python _ce.py
 
+BATCH_SIZE=224
 cudaid=${object_detection_cudaid_m:=0, 1, 2, 3}
 export CUDA_VISIBLE_DEVICES=$cudaid
 python train.py --batch_size=${BATCH_SIZE} --num_epochs=5 --enable_ce=True --lr_strategy=cosine_decay | python _ce.py
diff --git a/fluid/PaddleCV/image_classification/train.py b/fluid/PaddleCV/image_classification/train.py
index ba3c94d129965684e17f692faf653ffa15984371..8e53699ed09d1b8e2e55eb4b35e7a6bb41294720 100644
--- a/fluid/PaddleCV/image_classification/train.py
+++ b/fluid/PaddleCV/image_classification/train.py
@@ -242,7 +242,7 @@ def train(args):
         device_num = subprocess.check_output(['nvidia-smi', '-L']).decode().count('\n')
 
     train_batch_size = args.batch_size / device_num
-    test_batch_size = 8
+    test_batch_size = 16
     if not args.enable_ce:
         train_reader = paddle.batch(
             reader.train(), batch_size=train_batch_size, drop_last=True)
diff --git a/fluid/PaddleCV/object_detection/data_util.py b/fluid/PaddleCV/object_detection/data_util.py
index ac022593119e0008c3f7f3858303cbf5bc717650..e7d6b2b43eee5048fb5d3d8397a3e88aa0f14b49 100644
--- a/fluid/PaddleCV/object_detection/data_util.py
+++ b/fluid/PaddleCV/object_detection/data_util.py
@@ -68,6 +68,7 @@ class GeneratorEnqueuer(object):
                         try:
                             task()
                         except Exception:
+                            traceback.print_exc()
                             self._stop_event.set()
                             break
             else:
@@ -75,6 +76,7 @@ class GeneratorEnqueuer(object):
                     try:
                         task()
                     except Exception:
+                        traceback.print_exc()
                         self._stop_event.set()
                         break
 
diff --git a/fluid/PaddleCV/object_detection/reader.py b/fluid/PaddleCV/object_detection/reader.py
index 59da1b38fb2e9cce8bb99a2773e7fc222ee33bd8..6acc18594e5979308a7ba641002569b0867516a8 100644
--- a/fluid/PaddleCV/object_detection/reader.py
+++ b/fluid/PaddleCV/object_detection/reader.py
@@ -176,10 +176,17 @@ def coco(settings, file_list, mode, batch_size, shuffle):
         if mode == 'train' and shuffle:
             np.random.shuffle(images)
         batch_out = []
+        year = next((y for y in ('2014', '2017') if y in file_list), None)
+        split = "train" if mode == "train" else "val"
+        # Lists without a known year keep images directly under data_dir.
+        sub_dir = split + year if year else ""
+        data_dir = os.path.join(settings.data_dir, sub_dir)
         for image in images:
             image_name = image['file_name']
-            image_path = os.path.join(settings.data_dir, image_name)
-
+            image_path = os.path.join(data_dir, image_name)
+            if not os.path.exists(image_path):
+                raise ValueError("%s is not exist, you should specify "
+                                 "data path correctly." % image_path)
             im = Image.open(image_path)
             if im.mode == 'L':
                 im = im.convert('RGB')
@@ -242,7 +249,9 @@ def pascalvoc(settings, file_list, mode, batch_size, shuffle):
             image_path, label_path = image.split()
             image_path = os.path.join(settings.data_dir, image_path)
             label_path = os.path.join(settings.data_dir, label_path)
-
+            if not os.path.exists(image_path):
+                raise ValueError("%s is not exist, you should specify "
+                                 "data path correctly." % image_path)
             im = Image.open(image_path)
             if im.mode == 'L':
                 im = im.convert('RGB')
@@ -295,7 +304,6 @@ def train(settings,
           max_queue=24,
           enable_ce=False):
     file_list = os.path.join(settings.data_dir, file_list)
-
     if 'coco' in settings.dataset:
         generator = coco(settings, file_list, "train", batch_size, shuffle)
     else:
@@ -341,6 +349,9 @@ def test(settings, file_list, batch_size):
 
 def infer(settings, image_path):
     def reader():
+        if not os.path.exists(image_path):
+            raise ValueError("%s is not exist, you should specify "
+                             "data path correctly." % image_path)
         img = Image.open(image_path)
         if img.mode == 'L':
-            img = im.convert('RGB')
+            img = img.convert('RGB')
diff --git a/fluid/PaddleNLP/chinese_ner/infer.py b/fluid/PaddleNLP/chinese_ner/infer.py
index e22832d38bc5308444201bd302798cf18cae7d99..a15fdb53d89f2f7845e6bb54aa32fe922bb64682 100644
--- a/fluid/PaddleNLP/chinese_ner/infer.py
+++ b/fluid/PaddleNLP/chinese_ner/infer.py
@@ -52,7 +52,7 @@ def parse_args():
 
 def print_arguments(args):
     print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
+    for arg, value in sorted(vars(args).items()):
         print('%s: %s' % (arg, value))
     print('------------------------------------------------')
 
@@ -61,6 +61,7 @@ def load_reverse_dict(dict_path):
     return dict((idx, line.strip().split("\t")[0])
                 for idx, line in enumerate(open(dict_path, "r").readlines()))
 
+
 def to_lodtensor(data, place):
     seq_lens = [len(seq) for seq in data]
     cur_len = 0
@@ -76,7 +77,6 @@ def to_lodtensor(data, place):
     return res
 
 
-
 def infer(args):
     word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
     mention = fluid.layers.data(
@@ -108,8 +108,8 @@ def infer(args):
                 profiler.reset_profiler()
             iters = 0
             for data in test_data():
-                word = to_lodtensor(map(lambda x: x[0], data), place)
-                mention = to_lodtensor(map(lambda x: x[1], data), place)
+                word = to_lodtensor(list(map(lambda x: x[0], data)), place)
+                mention = to_lodtensor(list(map(lambda x: x[1], data)), place)
 
                 start = time.time()
                 crf_decode = exe.run(inference_program,
@@ -122,12 +122,12 @@ def infer(args):
                 np_data = np.array(crf_decode[0])
                 word_count = 0
                 assert len(data) == len(lod_info) - 1
-                for sen_index in xrange(len(data)):
+                for sen_index in range(len(data)):
                     assert len(data[sen_index][0]) == lod_info[
                         sen_index + 1] - lod_info[sen_index]
                     word_index = 0
-                    for tag_index in xrange(lod_info[sen_index],
-                                            lod_info[sen_index + 1]):
+                    for tag_index in range(lod_info[sen_index],
+                                           lod_info[sen_index + 1]):
                         word = str(data[sen_index][0][word_index])
                         gold_tag = label_reverse_dict[data[sen_index][2][
                             word_index]]
diff --git a/fluid/PaddleNLP/chinese_ner/train.py b/fluid/PaddleNLP/chinese_ner/train.py
index 7e59d2ed0793ae9499fc2a6618e762a9ac426800..7684f683e77b35be84b5753793f97308c7763cd8 100644
--- a/fluid/PaddleNLP/chinese_ner/train.py
+++ b/fluid/PaddleNLP/chinese_ner/train.py
@@ -65,7 +65,7 @@ def parse_args():
 
 def print_arguments(args):
     print('-----------  Configuration Arguments -----------')
-    for arg, value in sorted(vars(args).iteritems()):
+    for arg, value in sorted(vars(args).items()):
         print('%s: %s' % (arg, value))
     print('------------------------------------------------')
 
@@ -220,9 +220,9 @@ def test2(exe, chunk_evaluator, inference_program, test_data, place,
           cur_fetch_list):
     chunk_evaluator.reset()
     for data in test_data():
-        word = to_lodtensor(map(lambda x: x[0], data), place)
-        mention = to_lodtensor(map(lambda x: x[1], data), place)
-        target = to_lodtensor(map(lambda x: x[2], data), place)
+        word = to_lodtensor(list(map(lambda x: x[0], data)), place)
+        mention = to_lodtensor(list(map(lambda x: x[1], data)), place)
+        target = to_lodtensor(list(map(lambda x: x[2], data)), place)
         result_list = exe.run(
             inference_program,
             feed={"word": word,
@@ -232,8 +232,9 @@ def test2(exe, chunk_evaluator, inference_program, test_data, place,
         number_infer = np.array(result_list[0])
         number_label = np.array(result_list[1])
         number_correct = np.array(result_list[2])
-        chunk_evaluator.update(number_infer[0], number_label[0],
-                               number_correct[0])
+        chunk_evaluator.update(number_infer[0].astype('int64'),
+                               number_label[0].astype('int64'),
+                               number_correct[0].astype('int64'))
     return chunk_evaluator.eval()
 
 
@@ -241,9 +242,9 @@ def test(test_exe, chunk_evaluator, inference_program, test_data, place,
          cur_fetch_list):
     chunk_evaluator.reset()
     for data in test_data():
-        word = to_lodtensor(map(lambda x: x[0], data), place)
-        mention = to_lodtensor(map(lambda x: x[1], data), place)
-        target = to_lodtensor(map(lambda x: x[2], data), place)
+        word = to_lodtensor(list(map(lambda x: x[0], data)), place)
+        mention = to_lodtensor(list(map(lambda x: x[1], data)), place)
+        target = to_lodtensor(list(map(lambda x: x[2], data)), place)
         result_list = test_exe.run(
             fetch_list=cur_fetch_list,
             feed={"word": word,
@@ -252,8 +253,9 @@ def test(test_exe, chunk_evaluator, inference_program, test_data, place,
         number_infer = np.array(result_list[0])
         number_label = np.array(result_list[1])
         number_correct = np.array(result_list[2])
-        chunk_evaluator.update(number_infer.sum(),
-                               number_label.sum(), number_correct.sum())
+        chunk_evaluator.update(number_infer.sum().astype('int64'),
+                               number_label.sum().astype('int64'),
+                               number_correct.sum().astype('int64'))
     return chunk_evaluator.eval()
 
 
@@ -270,11 +272,6 @@ def main(args):
         crf_decode = fluid.layers.crf_decoding(
             input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
 
-        inference_program = fluid.default_main_program().clone(for_test=True)
-
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
-        sgd_optimizer.minimize(avg_cost)
-
         (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
          num_correct_chunks) = fluid.layers.chunk_eval(
              input=crf_decode,
@@ -282,6 +279,11 @@ def main(args):
              chunk_scheme="IOB",
              num_chunk_types=int(math.ceil((args.label_dict_len - 1) / 2.0)))
 
+        inference_program = fluid.default_main_program().clone(for_test=True)
+
+        sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
+        sgd_optimizer.minimize(avg_cost)
+
         chunk_evaluator = fluid.metrics.ChunkEvaluator()
 
         train_reader = paddle.batch(
@@ -312,7 +314,7 @@ def main(args):
             test_exe = exe
 
         batch_id = 0
-        for pass_id in xrange(args.num_passes):
+        for pass_id in range(args.num_passes):
             chunk_evaluator.reset()
             train_reader_iter = train_reader()
             start_time = time.time()
@@ -326,9 +328,9 @@ def main(args):
                         ],
                         feed=feeder.feed(cur_batch))
                     chunk_evaluator.update(
-                        np.array(nums_infer).sum(),
-                        np.array(nums_label).sum(),
-                        np.array(nums_correct).sum())
+                        np.array(nums_infer).sum().astype("int64"),
+                        np.array(nums_label).sum().astype("int64"),
+                        np.array(nums_correct).sum().astype("int64"))
                     cost_list = np.array(cost)
                     batch_id += 1
                 except StopIteration:
diff --git a/fluid/PaddleNLP/deep_attention_matching_net/_ce.py b/fluid/PaddleNLP/deep_attention_matching_net/_ce.py
index 0c38c0a3d1b0fc0a240a7bae928d9c07f8b95886..7ad30288074da3124c33fad6c96fd369a812c77c 100644
--- a/fluid/PaddleNLP/deep_attention_matching_net/_ce.py
+++ b/fluid/PaddleNLP/deep_attention_matching_net/_ce.py
@@ -7,8 +7,8 @@ from kpi import CostKpi, DurationKpi, AccKpi
 
 #### NOTE kpi.py should shared in models in some way!!!!
 
-train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
-train_duration_kpi = DurationKpi('train_duration', 0.05, actived=True)
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.05, 0, actived=True)
 
 tracking_kpis = [
     train_cost_kpi,
diff --git a/fluid/PaddleNLP/machine_reading_comprehension/_ce.py b/fluid/PaddleNLP/machine_reading_comprehension/_ce.py
index cff13c8722007987a3cd82f1298206248963e45a..a425fe951fb587749f31b18959917cdeed76a41d 100644
--- a/fluid/PaddleNLP/machine_reading_comprehension/_ce.py
+++ b/fluid/PaddleNLP/machine_reading_comprehension/_ce.py
@@ -3,6 +3,7 @@
 import os
 import sys
 #sys.path.insert(0, os.environ['ceroot'])
+sys.path.append(os.environ['ceroot'])
 from kpi import CostKpi, DurationKpi, AccKpi
 
 #### NOTE kpi.py should shared in models in some way!!!!
diff --git a/fluid/PaddleNLP/machine_reading_comprehension/dataset.py b/fluid/PaddleNLP/machine_reading_comprehension/dataset.py
index 3aaf87be9a7b0659fa9e79eb8329911cbea73c55..c732ce041c5e82ea5e1471ba422f5b056a7cba8f 100644
--- a/fluid/PaddleNLP/machine_reading_comprehension/dataset.py
+++ b/fluid/PaddleNLP/machine_reading_comprehension/dataset.py
@@ -23,6 +23,7 @@ import json
 import logging
 import numpy as np
 from collections import Counter
+import io
 
 
 class BRCDataset(object):
@@ -67,7 +68,7 @@ class BRCDataset(object):
         Args:
             data_path: the data file to load
         """
-        with open(data_path) as fin:
+        with io.open(data_path, 'r', encoding='utf-8') as fin:
             data_set = []
             for lidx, line in enumerate(fin):
                 sample = json.loads(line.strip())
diff --git a/fluid/PaddleNLP/machine_reading_comprehension/run.py b/fluid/PaddleNLP/machine_reading_comprehension/run.py
index dbe3a4b9a296fdaf089d55be3f0c9845422f0ce5..74561297f003faa4b3d871c0f327b65da63e81e7 100644
--- a/fluid/PaddleNLP/machine_reading_comprehension/run.py
+++ b/fluid/PaddleNLP/machine_reading_comprehension/run.py
@@ -22,6 +22,7 @@ import os
 import random
 import json
 import six
+import multiprocessing
 
 import paddle
 import paddle.fluid as fluid
diff --git a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py
index 0e9c18416f62c85e76dd060f1fad44073e5841fc..5fc98868aa6e36bc5d1c5c0ad7ab231cda0fd52d 100644
--- a/fluid/PaddleNLP/neural_machine_translation/transformer/train.py
+++ b/fluid/PaddleNLP/neural_machine_translation/transformer/train.py
@@ -469,7 +469,7 @@ def train_loop(exe,
     # For faster executor
     exec_strategy = fluid.ExecutionStrategy()
     exec_strategy.use_experimental_executor = True
-    # exec_strategy.num_iteration_per_drop_scope = 5
+    exec_strategy.num_iteration_per_drop_scope = int(args.fetch_steps)
     build_strategy = fluid.BuildStrategy()
     # Since the token number differs among devices, customize gradient scale to
     # use token average cost among multi-devices. and the gradient scale is
diff --git a/fluid/PaddleNLP/text_classification/train.py b/fluid/PaddleNLP/text_classification/train.py
index 159266f3956b950afa200e9f53c9fdc6c36309aa..174636f06ec5fe07180347745f910166140e9eed 100644
--- a/fluid/PaddleNLP/text_classification/train.py
+++ b/fluid/PaddleNLP/text_classification/train.py
@@ -89,7 +89,7 @@ def train(train_reader,
 
 def train_net():
     word_dict, train_reader, test_reader = utils.prepare_data(
-        "imdb", self_dict=False, batch_size=4, buf_size=50000)
+        "imdb", self_dict=False, batch_size=128, buf_size=50000)
 
     if sys.argv[1] == "bow":
         train(
diff --git a/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh b/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
old mode 100644
new mode 100755
index eca247a40a3f680a6a59c4a183bfba006ced8d44..f1bb7febd3f2c572544612baf24be14c711108e3
--- a/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
+++ b/fluid/PaddleNLP/text_matching_on_quora/.run_ce.sh
@@ -6,9 +6,9 @@ export OMP_NUM_THREADS=1
 cudaid=${text_matching_on_quora:=0} # use 0-th card as default
 export CUDA_VISIBLE_DEVICES=$cudaid
 
-FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
+FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
 
 cudaid=${text_matching_on_quora_m:=0,1,2,3} # use 0,1,2,3 card as default
 export CUDA_VISIBLE_DEVICES=$cudaid
 
-FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce | python _ce.py
+FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
diff --git a/fluid/PaddleNLP/text_matching_on_quora/_ce.py b/fluid/PaddleNLP/text_matching_on_quora/_ce.py
index b38ad21a1e0eb7407f78d100a3cb3659f6c5d8d3..eadeb821da6f7049d1916a65a1ae4eb995c5cb6d 100644
--- a/fluid/PaddleNLP/text_matching_on_quora/_ce.py
+++ b/fluid/PaddleNLP/text_matching_on_quora/_ce.py
@@ -7,11 +7,11 @@ from kpi import CostKpi
 from kpi import DurationKpi
 
 
-each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.05, 0, actived=True)
-train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.2, 0)
+each_pass_duration_card1_kpi = DurationKpi('each_pass_duration_card1', 0.08, 0, actived=True)
+train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.08, 0)
 train_avg_acc_card1_kpi = CostKpi('train_avg_acc_card1', 0.02, 0)
-each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.05, 0, actived=True)
-train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.2, 0)
+each_pass_duration_card4_kpi = DurationKpi('each_pass_duration_card4', 0.08, 0, actived=True)
+train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.08, 0)
 train_avg_acc_card4_kpi = CostKpi('train_avg_acc_card4', 0.02, 0)
 
 tracking_kpis = [
diff --git a/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py b/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
index 714fa6f970d9f213efdc6b6e1799b244696fb20d..0f88c6b6ef13aec25e08527b7efabe8638a3af25 100755
--- a/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
+++ b/fluid/PaddleNLP/text_matching_on_quora/train_and_evaluate.py
@@ -34,6 +34,7 @@ parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument('--model_name',       type=str,   default='cdssmNet',                  help="Which model to train")
 parser.add_argument('--config',           type=str,   default='cdssm_base',       help="The global config setting")
 parser.add_argument('--enable_ce', action='store_true', help='If set, run the task with continuous evaluation logs.')
+parser.add_argument('--epoch_num', type=int, help='Number of epoch')
 
 DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')
 
@@ -241,6 +242,9 @@ def main():
     args = parser.parse_args()
     global_config = configs.__dict__[args.config]()
 
+    if args.epoch_num is not None:  # CLI value overrides the config
+        global_config.epoch_num = args.epoch_num
+
     print("net_name: ", args.model_name)
     net = models.__dict__[args.model_name](global_config)
 
diff --git a/fluid/PaddleRec/multiview_simnet/nets.py b/fluid/PaddleRec/multiview_simnet/nets.py
index 41e366f55c80c5151102ed5e81a2746774fb3b4b..fed177844bdd247d163aee9e8625cd0ec74378b3 100644
--- a/fluid/PaddleRec/multiview_simnet/nets.py
+++ b/fluid/PaddleRec/multiview_simnet/nets.py
@@ -33,7 +33,7 @@ class CNNEncoder(object):
     """ cnn-encoder"""
 
     def __init__(self,
-                 param_name="cnn.w",
+                 param_name="cnn",
                  win_size=3,
                  ksize=128,
                  act='tanh',
@@ -51,13 +51,15 @@ class CNNEncoder(object):
             filter_size=self.win_size,
             act=self.act,
             pool_type=self.pool_type,
-            param_attr=str(self.param_name))
+            param_attr=self.param_name + ".param",
+            bias_attr=self.param_name + ".bias")
+        
 
 
 class GrnnEncoder(object):
     """ grnn-encoder """
 
-    def __init__(self, param_name="grnn.w", hidden_size=128):
+    def __init__(self, param_name="grnn", hidden_size=128):
         self.param_name = param_name
         self.hidden_size = hidden_size
 
@@ -65,13 +67,15 @@ class GrnnEncoder(object):
         fc0 = nn.fc(
             input=emb, 
             size=self.hidden_size * 3, 
-            param_attr=str(str(self.param_name) + "_fc")
-        )
+            param_attr=self.param_name + "_fc.w",
+            bias_attr=False)
+        
         gru_h = nn.dynamic_gru(
             input=fc0,
             size=self.hidden_size,
             is_reverse=False,
-            param_attr=str(self.param_name))
+            param_attr=self.param_name + ".param",
+            bias_attr=self.param_name + ".bias")
         return nn.sequence_pool(input=gru_h, pool_type='max')
 
 
@@ -139,17 +143,17 @@ class MultiviewSimnet(object):
         # lookup embedding for each slot
         q_embs = [
             nn.embedding(
-                input=query, size=self.emb_shape, param_attr="emb.w")
+                input=query, size=self.emb_shape, param_attr="emb")
             for query in q_slots
         ]
         pt_embs = [
             nn.embedding(
-                input=title, size=self.emb_shape, param_attr="emb.w")
+                input=title, size=self.emb_shape, param_attr="emb")
             for title in pt_slots
         ]
         nt_embs = [
             nn.embedding(
-                input=title, size=self.emb_shape, param_attr="emb.w")
+                input=title, size=self.emb_shape, param_attr="emb")
             for title in nt_slots
         ]
 
@@ -170,9 +174,9 @@ class MultiviewSimnet(object):
         nt_concat = nn.concat(nt_encodes)
 
         # projection of hidden layer
-        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
-        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
-        nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w')
+        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b')
+        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
+        nt_hid = nn.fc(nt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
 
         # cosine of hidden layers
         cos_pos = nn.cos_sim(q_hid, pt_hid)
@@ -213,12 +217,12 @@ class MultiviewSimnet(object):
         # lookup embedding for each slot
         q_embs = [
             nn.embedding(
-                input=query, size=self.emb_shape, param_attr="emb.w")
+                input=query, size=self.emb_shape, param_attr="emb")
             for query in q_slots
         ]
         pt_embs = [
             nn.embedding(
-                input=title, size=self.emb_shape, param_attr="emb.w")
+                input=title, size=self.emb_shape, param_attr="emb")
             for title in pt_slots
         ]
         # encode each embedding field with encoder
@@ -232,8 +236,8 @@ class MultiviewSimnet(object):
         q_concat = nn.concat(q_encodes)
         pt_concat = nn.concat(pt_encodes)
         # projection of hidden layer
-        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w')
-        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w')
+        q_hid = nn.fc(q_concat, size=self.hidden_size, param_attr='q_fc.w', bias_attr='q_fc.b')
+        pt_hid = nn.fc(pt_concat, size=self.hidden_size, param_attr='t_fc.w', bias_attr='t_fc.b')
         # cosine of hidden layers
         cos = nn.cos_sim(q_hid, pt_hid)
         return cos