diff --git a/fluid/language_model/.run_ce.sh b/fluid/language_model/.run_ce.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5ee2d8aa0582b2b8504f9ba645b6252aa75f23bf
--- /dev/null
+++ b/fluid/language_model/.run_ce.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+cudaid=${language_model:=0} # use the 0-th card as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
+
+cudaid=${language_model_m:=0,1,2,3} # use cards 0,1,2,3 as default
+export CUDA_VISIBLE_DEVICES=$cudaid
+
+FLAGS_benchmark=true python train.py --enable_ce | python _ce.py
diff --git a/fluid/language_model/_ce.py b/fluid/language_model/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4999d7a1e14e333f1c7056b3dc2c5b506682ec6
--- /dev/null
+++ b/fluid/language_model/_ce.py
@@ -0,0 +1,62 @@
+# This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi
+from kpi import DurationKpi
+
+imikolov_20_avg_ppl_kpi = CostKpi('imikolov_20_avg_ppl', 0.2, 0)
+imikolov_20_pass_duration_kpi = DurationKpi(
+    'imikolov_20_pass_duration', 0.02, 0, actived=True)
+imikolov_20_avg_ppl_kpi_card4 = CostKpi('imikolov_20_avg_ppl_card4', 0.2, 0)
+imikolov_20_pass_duration_kpi_card4 = DurationKpi(
+    'imikolov_20_pass_duration_card4', 0.03, 0, actived=True)
+
+tracking_kpis = [
+    imikolov_20_avg_ppl_kpi,
+    imikolov_20_pass_duration_kpi,
+    imikolov_20_avg_ppl_kpi_card4,
+    imikolov_20_pass_duration_kpi_card4,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Suggested format:
+
+    each line in the log should be a tab-separated key/value pair, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    log_to_ce(log)
diff --git a/fluid/language_model/train.py b/fluid/language_model/train.py
index 59fc3a987746af7aec9b61b5c817400b6b6546d0..f3e7a7398bf13e14c74ce1d10d90b7bf34031698 100644
--- a/fluid/language_model/train.py
+++ b/fluid/language_model/train.py
@@ -1,14 +1,28 @@
+import os
 import sys
 import time
 
 import numpy as np
 import math
-
+import argparse
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 import utils
 
+SEED = 102
+
+
+def parse_args():
+    parser = argparse.ArgumentParser("language_model benchmark.")
+    parser.add_argument(
+        '--enable_ce',
+        action='store_true',
+        help='If set, run the task with '
+        'continuous evaluation logs.')
+    args = parser.parse_args()
+    return args
+
 
 def network(src, dst, vocab_size, hid_size, init_low_bound, init_high_bound):
     """ network definition """
@@ -63,31 +77,26 @@ def train(train_reader,
           init_low_bound=-0.04,
          init_high_bound=0.04):
     """ train network """
+
+    args = parse_args()
+    if args.enable_ce:
+        # The random seed must be set before configuring the network.
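+        # Fixing it keeps CE runs reproducible from run to run.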
+        fluid.default_startup_program().random_seed = SEED
 
     vocab_size = len(vocab)
+    # Input data
     src_wordseq = fluid.layers.data(
         name="src_wordseq", shape=[1], dtype="int64", lod_level=1)
     dst_wordseq = fluid.layers.data(
         name="dst_wordseq", shape=[1], dtype="int64", lod_level=1)
 
+    # Train program
     avg_cost = None
-    if not parallel:
-        cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
-                       init_low_bound, init_high_bound)
-        avg_cost = fluid.layers.mean(x=cost)
-    else:
-        places = fluid.layers.get_places()
-        pd = fluid.layers.ParallelDo(places)
-        with pd.do():
-            cost = network(
-                pd.read_input(src_wordseq),
-                pd.read_input(dst_wordseq), vocab_size, hid_size,
-                init_low_bound, init_high_bound)
-            pd.write_output(cost)
-
-        cost = pd()
-        avg_cost = fluid.layers.mean(x=cost)
+    cost = network(src_wordseq, dst_wordseq, vocab_size, hid_size,
+                   init_low_bound, init_high_bound)
+    avg_cost = fluid.layers.mean(x=cost)
 
+    # Optimization to minimize loss
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.exponential_decay(
             learning_rate=base_lr,
@@ -96,39 +105,56 @@ def train(train_reader,
             staircase=True))
     sgd_optimizer.minimize(avg_cost)
 
+    # Initialize executor
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
+
+    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=avg_cost.name)
 
+    total_time = 0.0
+    fetch_list = [avg_cost.name]
     for pass_idx in xrange(pass_num):
         epoch_idx = pass_idx + 1
         print "epoch_%d start" % epoch_idx
 
         t0 = time.time()
         i = 0
+        newest_ppl = 0
         for data in train_reader():
             i += 1
             lod_src_wordseq = utils.to_lodtensor(
                 map(lambda x: x[0], data), place)
             lod_dst_wordseq = utils.to_lodtensor(
                 map(lambda x: x[1], data), place)
-            ret_avg_cost = exe.run(fluid.default_main_program(),
-                                   feed={
-                                       "src_wordseq": lod_src_wordseq,
-                                       "dst_wordseq": lod_dst_wordseq
-                                   },
-                                   fetch_list=[avg_cost],
-                                   use_program_cache=True)
-            avg_ppl = math.exp(ret_avg_cost[0])
+            ret_avg_cost = train_exe.run(feed={
+                "src_wordseq": lod_src_wordseq,
+                "dst_wordseq": lod_dst_wordseq
+            },
+                                         fetch_list=fetch_list)
+            avg_ppl = np.exp(ret_avg_cost[0])
+            newest_ppl = np.mean(avg_ppl)
             if i % 100 == 0:
-                print "step:%d ppl:%.3f" % (i, avg_ppl)
+                print "step:%d ppl:%.3f" % (i, newest_ppl)
 
         t1 = time.time()
         total_time += t1 - t0
         print "epoch:%d num_steps:%d time_cost(s):%f" % (epoch_idx, i,
                                                          total_time / epoch_idx)
 
+        if pass_idx == pass_num - 1 and args.enable_ce:
+            # Note: The following logs are used only for CE monitoring.
+            # Other runs can ignore them.
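+            # KPI lines must be tab-separated ("kpis\t<name>\t<value>") so
+            # that parse_log() in _ce.py can pick them up.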
+            gpu_num = get_cards(args.enable_ce)
+            if gpu_num == 1:
+                print("kpis\timikolov_20_pass_duration\t%s" %
+                      (total_time / epoch_idx))
+                print("kpis\timikolov_20_avg_ppl\t%s" % newest_ppl)
+            else:
+                print("kpis\timikolov_20_pass_duration_card%s\t%s" %
+                      (gpu_num, total_time / epoch_idx))
+                print("kpis\timikolov_20_avg_ppl_card%s\t%s" %
+                      (gpu_num, newest_ppl))
 
         save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
         feed_var_names = ["src_wordseq", "dst_wordseq"]
         fetch_vars = [avg_cost]
@@ -138,11 +164,22 @@ def train(train_reader,
     print("finish training")
 
 
+def get_cards(enable_ce):
+    if enable_ce:
+        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
+        num = len(cards.split(","))
+        return num
+    else:
+        return fluid.core.get_cuda_device_count()
+
+
 def train_net():
     """ do training """
     batch_size = 20
+    args = parse_args()
     vocab, train_reader, test_reader = utils.prepare_data(
-        batch_size=batch_size, buffer_size=1000, word_freq_threshold=0)
+        batch_size=batch_size * get_cards(args.enable_ce), buffer_size=1000,
+        word_freq_threshold=0, enable_ce=args.enable_ce)
     train(
         train_reader=train_reader,
         vocab=vocab,
@@ -152,7 +189,7 @@ def train_net():
         batch_size=batch_size,
         pass_num=12,
         use_cuda=True,
-        parallel=False,
+        parallel=True,
         model_dir="model",
         init_low_bound=-0.1,
         init_high_bound=0.1)
diff --git a/fluid/language_model/utils.py b/fluid/language_model/utils.py
index c5909046176586556a2aedba5dd5d12810b3ea8d..dd03a89835e620dc8432a6ca16392fc5173a12d4 100644
--- a/fluid/language_model/utils.py
+++ b/fluid/language_model/utils.py
@@ -3,7 +3,7 @@ import time
 import numpy as np
 
 import paddle.fluid as fluid
-import paddle.v2 as paddle
+import paddle
 
 
 def to_lodtensor(data, place):
@@ -22,17 +22,28 @@ def to_lodtensor(data, place):
     return res
 
 
-def prepare_data(batch_size, buffer_size=1000, word_freq_threshold=0):
+def prepare_data(batch_size,
+                 buffer_size=1000,
+                 word_freq_threshold=0,
+                 enable_ce=False):
     """ prepare the English Pann Treebank (PTB) data """
     vocab = paddle.dataset.imikolov.build_dict(word_freq_threshold)
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
+    if enable_ce:
+        train_reader = paddle.batch(
             paddle.dataset.imikolov.train(
                 vocab,
                 buffer_size,
                 data_type=paddle.dataset.imikolov.DataType.SEQ),
-            buf_size=buffer_size),
-        batch_size)
+            batch_size)
+    else:
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.imikolov.train(
+                    vocab,
+                    buffer_size,
+                    data_type=paddle.dataset.imikolov.DataType.SEQ),
+                buf_size=buffer_size),
+            batch_size)
     test_reader = paddle.batch(
         paddle.dataset.imikolov.test(
             vocab, buffer_size, data_type=paddle.dataset.imikolov.DataType.SEQ),
diff --git a/fluid/neural_machine_translation/rnn_search/_ce.py b/fluid/neural_machine_translation/rnn_search/_ce.py
index 32b13c66a2bbb5bed7e71c7924dea9da4e4182fe..e948336e82141c4a2072a02f73b51cb7b4396ca0 100644
--- a/fluid/neural_machine_translation/rnn_search/_ce.py
+++ b/fluid/neural_machine_translation/rnn_search/_ce.py
@@ -7,9 +7,9 @@ from kpi import CostKpi, DurationKpi, AccKpi
 
 #### NOTE kpi.py should shared in models in some way!!!!
 
-train_cost_kpi = CostKpi('train_cost', 0.02, actived=True)
-test_cost_kpi = CostKpi('test_cost', 0.005, actived=True)
-train_duration_kpi = DurationKpi('train_duration', 0.06, actived=True)
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+test_cost_kpi = CostKpi('test_cost', 0.005, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.06, 0, actived=True)
 
 tracking_kpis = [
     train_cost_kpi,
diff --git a/fluid/neural_machine_translation/rnn_search/train.py b/fluid/neural_machine_translation/rnn_search/train.py
index 989d8bb686a561f4903b4346cd0bb9b29a23e8e7..ade0dd751af1a2e83bb99da22281061dce44fbd1 100644
--- a/fluid/neural_machine_translation/rnn_search/train.py
+++ b/fluid/neural_machine_translation/rnn_search/train.py
@@ -151,9 +151,9 @@ def train():
 
         # This log is for continuous evaluation only
         if args.enable_ce:
-            print("kpis train_cost %f" % avg_cost_train)
-            print("kpis test_cost %f" % test_loss)
-            print("kpis train_duration %f" % time_consumed)
+            print("kpis\ttrain_cost\t%f" % avg_cost_train)
+            print("kpis\ttest_cost\t%f" % test_loss)
+            print("kpis\ttrain_duration\t%f" % time_consumed)
 
         if pass_id % args.save_interval == 0:
             model_path = os.path.join(args.save_dir, str(pass_id))
diff --git a/fluid/object_detection/.move.sh b/fluid/object_detection/.move.sh
deleted file mode 100644
index d27f72663727d89e487a3eac9eec110818b28a97..0000000000000000000000000000000000000000
--- a/fluid/object_detection/.move.sh
+++ /dev/null
@@ -1 +0,0 @@
-cp -r ./data/pascalvoc/. /home/.cache/paddle/dataset/pascalvoc
diff --git a/fluid/object_detection/.run.sh b/fluid/object_detection/.run.sh
deleted file mode 100644
index 7be97aaf3dfa5344e872e8d12b0ff0d8f5405df3..0000000000000000000000000000000000000000
--- a/fluid/object_detection/.run.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-export MKL_NUM_THREADS=1
-export OMP_NUM_THREADS=1
-cudaid=${object_detection_cudaid:=0} # use 0-th card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-if [ ! -d "/root/.cache/paddle/dataset/pascalvoc" ];then
-    mkdir -p /root/.cache/paddle/dataset/pascalvoc
-    ./data/pascalvoc/download.sh
-    bash ./.move.sh
-fi
-FLAGS_benchmark=true python train.py --batch_size=64 --num_passes=2 --for_model_ce=True --data_dir=/root/.cache/paddle/dataset/pascalvoc/
diff --git a/fluid/object_detection/.run_ce.sh b/fluid/object_detection/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..50809e77043e0eb0bb5f6bf5a9904d8113c85756
--- /dev/null
+++ b/fluid/object_detection/.run_ce.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# This file is only used for continuous evaluation.
+
+export MKL_NUM_THREADS=1
+export OMP_NUM_THREADS=1
+
+if [ ! -d "/root/.cache/paddle/dataset/pascalvoc" ];then
+    mkdir -p /root/.cache/paddle/dataset/pascalvoc
+    ./data/pascalvoc/download.sh
+    cp -r ./data/pascalvoc/. /root/.cache/paddle/dataset/pascalvoc
+fi
+
+cudaid=${object_detection_cudaid:=0}
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
+
+cudaid=${object_detection_cudaid_m:=0,1,2,3}
+export CUDA_VISIBLE_DEVICES=$cudaid
+FLAGS_benchmark=true python train.py --enable_ce=True --batch_size=64 --num_passes=2 --data_dir=/root/.cache/paddle/dataset/pascalvoc/ | python _ce.py
diff --git a/fluid/object_detection/_ce.py b/fluid/object_detection/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f17ff324d8c4bb1d0cecca2401e584a7ec5e3af
--- /dev/null
+++ b/fluid/object_detection/_ce.py
@@ -0,0 +1,72 @@
+#### This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared among models in some way!!!!
+
+train_cost_kpi = CostKpi('train_cost', 0.02, 0, actived=True)
+test_acc_kpi = AccKpi('test_acc', 0.01, 0, actived=True)
+train_speed_kpi = AccKpi('train_speed', 0.2, 0, actived=True)
+train_cost_card4_kpi = CostKpi('train_cost_card4', 0.02, 0, actived=True)
+test_acc_card4_kpi = AccKpi('test_acc_card4', 0.01, 0, actived=True)
+train_speed_card4_kpi = AccKpi('train_speed_card4', 0.2, 0, actived=True)
+
+tracking_kpis = [
+    train_cost_kpi,
+    test_acc_kpi,
+    train_speed_kpi,
+    train_cost_card4_kpi,
+    test_acc_card4_kpi,
+    train_speed_card4_kpi,
+]
+
+
+def parse_log(log):
+    '''
+    This method should be implemented by model developers.
+
+    Suggested format:
+
+    each line in the log should be a tab-separated key/value pair, for example:
+
+    "
+    train_cost\t1.0
+    test_cost\t1.0
+    train_cost\t1.0
+    train_cost\t1.0
+    train_acc\t1.2
+    "
+    '''
+    #kpi_map = {}
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            #kpi_map[kpi_name] = kpi_value
+            yield kpi_name, kpi_value
+    #return kpi_map
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/fluid/object_detection/mobilenet_ssd.py b/fluid/object_detection/mobilenet_ssd.py
index c39883196056aede5d410554e14a0198e540d754..b87c0558447397e0a5b6a7a1e689a316d1ee8e14 100644
--- a/fluid/object_detection/mobilenet_ssd.py
+++ b/fluid/object_detection/mobilenet_ssd.py
@@ -1,4 +1,3 @@
-import paddle.v2 as paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
diff --git a/fluid/object_detection/train.py b/fluid/object_detection/train.py
index 24225ab31d063cd55df86d7a31d46434578b4b6a..aadcc904f55f077c06630a1f8e27a6bf4b422c05 100644
--- a/fluid/object_detection/train.py
+++ b/fluid/object_detection/train.py
@@ -23,7 +23,7 @@ add_arg('dataset',          str,   'pascalvoc', "coco2014, coco2017, and pascalv
 add_arg('model_save_dir',   str,   'model',     "The path to save model.")
 add_arg('pretrained_model', str,   'pretrained/ssd_mobilenet_v1_coco/', "The init model path.")
 add_arg('apply_distort',    bool,  True,   "Whether apply distort.")
-add_arg('apply_expand',     bool,  True,   "Whether appley expand.")
+add_arg('apply_expand',     bool,  True,   "Whether apply expand.")
 add_arg('nms_threshold',    float, 0.45,   "NMS threshold.")
 add_arg('ap_version',       str,   '11point',           "integral, 11point.")
 add_arg('resize_h',         int,   300,    "The resized image height.")
@@ -32,10 +32,8 @@ add_arg('mean_value_B',     float, 127.5, "Mean value for B channel which will
 add_arg('mean_value_G',     float, 127.5, "Mean value for G channel which will be subtracted.")  #116.78
 add_arg('mean_value_R',     float, 127.5, "Mean value for R channel which will be subtracted.")  #103.94
 add_arg('is_toy',           int,   0, "Toy for quick debug, 0 means using all data, while n means using only n sample.")
-add_arg('for_model_ce',     bool,  False,  "Use CE to evaluate the model")
 add_arg('data_dir',         str,   'data/pascalvoc', "data directory")
-add_arg('skip_batch_num',   int,   5,      "the num of minibatch to skip.")
-add_arg('iterations',       int,   120,    "mini batchs.")
+add_arg('enable_ce',        bool,  False,  "Whether to use CE to evaluate the model.")
 
 #yapf: enable
@@ -48,6 +46,9 @@ def train(args,
           num_passes,
           model_save_dir,
           pretrained_model=None):
+    if args.enable_ce:
+        fluid.framework.default_startup_program().random_seed = 111
+
     image_shape = [3, data_args.resize_h, data_args.resize_w]
     if 'coco' in data_args.dataset:
         num_classes = 91
@@ -121,8 +122,12 @@ def train(args,
     train_exe = fluid.ParallelExecutor(
         use_cuda=args.use_gpu, loss_name=loss.name)
 
-    train_reader = paddle.batch(
-        reader.train(data_args, train_file_list), batch_size=batch_size)
+    if not args.enable_ce:
+        train_reader = paddle.batch(
+            reader.train(data_args, train_file_list), batch_size=batch_size)
+    else:
+        train_reader = paddle.batch(
+            reader.train(data_args, train_file_list, False), batch_size=batch_size)
     test_reader = paddle.batch(
         reader.test(data_args, val_file_list), batch_size=batch_size)
     feeder = fluid.DataFeeder(
@@ -140,32 +145,32 @@ def train(args,
     def test(pass_id, best_map):
         _, accum_map = map_eval.get_map_var()
         map_eval.reset(exe)
+        every_pass_map = []
         for batch_id, data in enumerate(test_reader()):
             test_map, = exe.run(test_program,
                                 feed=feeder.feed(data),
                                 fetch_list=[accum_map])
             if batch_id % 20 == 0:
+                every_pass_map.append(test_map)
                 print("Batch {0}, map {1}".format(batch_id, test_map))
+        mean_map = np.mean(every_pass_map)
         if test_map[0] > best_map:
             best_map = test_map[0]
             save_model('best_model')
         print("Pass {0}, test map {1}".format(pass_id, test_map))
-        return best_map
+        return best_map, mean_map
 
-    train_num = 0
-    total_train_time = 0.0
+    total_time = 0.0
     for pass_id in range(num_passes):
+        epoch_idx = pass_id + 1
         start_time = time.time()
         prev_start_time = start_time
-        # end_time = 0
         every_pass_loss = []
         iter = 0
         pass_duration = 0.0
         for batch_id, data in enumerate(train_reader()):
             prev_start_time = start_time
             start_time = time.time()
-            if args.for_model_ce and iter == args.iterations:
-                break
             if len(data) < (devices_num * 2):
                 print("There are too few data to train on all devices.")
                 continue
@@ -176,34 +181,31 @@ def train(args,
             loss_v, = exe.run(fluid.default_main_program(),
                               feed=feeder.feed(data),
                               fetch_list=[loss])
-            # end_time = time.time()
             loss_v = np.mean(np.array(loss_v))
+            every_pass_loss.append(loss_v)
             if batch_id % 20 == 0:
                 print("Pass {0}, batch {1}, loss {2}, time {3}".format(
                     pass_id, batch_id, loss_v, start_time - prev_start_time))
-            if args.for_model_ce and iter >= args.skip_batch_num or pass_id != 0:
-                batch_duration = time.time() - start_time
-                pass_duration += batch_duration
-                train_num += len(data)
-            every_pass_loss.append(loss_v)
-            iter += 1
-        total_train_time += pass_duration
-
-        if args.for_model_ce and pass_id == num_passes - 1:
-            examples_per_sec = train_num / total_train_time
-            cost = np.mean(every_pass_loss)
-            with open("train_speed_factor.txt", 'w') as f:
-                f.write('{:f}\n'.format(examples_per_sec))
-            with open("train_cost_factor.txt", 'a+') as f:
-                f.write('{:f}\n'.format(cost))
-
-        best_map = test(pass_id, best_map)
+        end_time = time.time()
+        best_map, mean_map = test(pass_id, best_map)
+        if args.enable_ce and pass_id == 1:
+            total_time += end_time - start_time
+            train_avg_loss = np.mean(every_pass_loss)
+            if devices_num == 1:
+                print("kpis\ttrain_cost\t%s" % train_avg_loss)
+                print("kpis\ttest_acc\t%s" % mean_map)
+                print("kpis\ttrain_speed\t%s" % (total_time / epoch_idx))
+            else:
+                print("kpis\ttrain_cost_card%s\t%s" % (devices_num, train_avg_loss))
+                print("kpis\ttest_acc_card%s\t%s" % (devices_num, mean_map))
+                print("kpis\ttrain_speed_card%s\t%f" % (devices_num, total_time / epoch_idx))
+
         if pass_id % 10 == 0 or pass_id == num_passes - 1:
             save_model(str(pass_id))
     print("Best test map {0}".format(best_map))
 
-
 if __name__ == '__main__':
     args = parser.parse_args()
     print_arguments(args)
diff --git a/fluid/sequence_tagging_for_ner/train.py b/fluid/sequence_tagging_for_ner/train.py
index 6ed77cd5ca1d504a8b79b4f87349242b5051c539..8aaf660f35aa63b7d298d1d380b579a66f421de7 100644
--- a/fluid/sequence_tagging_for_ner/train.py
+++ b/fluid/sequence_tagging_for_ner/train.py
@@ -53,7 +53,7 @@ def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
         chunk_scheme="IOB",
         num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
 
-    inference_program = fluid.default_main_program().clone()
+    inference_program = fluid.default_main_program().clone(for_test=True)
     with fluid.program_guard(inference_program):
         test_target = chunk_evaluator.metrics + chunk_evaluator.states
         inference_program = fluid.io.get_inference_program(test_target)
diff --git a/fluid/text_classification/.run_ce.sh b/fluid/text_classification/.run_ce.sh
new file mode 100755
index 0000000000000000000000000000000000000000..777ba02e8c78760452a6fb7d4ac1dc4a82d62594
--- /dev/null
+++ b/fluid/text_classification/.run_ce.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+# This file is only used for continuous evaluation.
+
+export CE_MODE_X=1
+python train.py cnn | python _ce.py
diff --git a/fluid/text_classification/_ce.py b/fluid/text_classification/_ce.py
new file mode 100644
index 0000000000000000000000000000000000000000..100357204db7f3a8d0c1d3cbcbdc707410b20023
--- /dev/null
+++ b/fluid/text_classification/_ce.py
@@ -0,0 +1,48 @@
+#### This file is only used for continuous evaluation tests!
+
+import os
+import sys
+sys.path.append(os.environ['ceroot'])
+from kpi import CostKpi, DurationKpi, AccKpi
+
+#### NOTE kpi.py should be shared among models in some way!!!!
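+#### The KPI names below must match the names that train.py prints as
+#### tab-separated "kpis\t<name>\t<value>" lines.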
+
+train_acc_kpi = AccKpi('train_acc', 0.005, 0, actived=True)
+train_cost_kpi = CostKpi('train_cost', 0.005, 0, actived=True)
+train_duration_kpi = DurationKpi('train_duration', 0.05, 0, actived=True)
+
+tracking_kpis = [
+    train_acc_kpi,
+    train_cost_kpi,
+    train_duration_kpi,
+]
+
+
+def parse_log(log):
+    for line in log.split('\n'):
+        fs = line.strip().split('\t')
+        print(fs)
+        if len(fs) == 3 and fs[0] == 'kpis':
+            print("-----%s" % fs)
+            kpi_name = fs[1]
+            kpi_value = float(fs[2])
+            yield kpi_name, kpi_value
+
+
+def log_to_ce(log):
+    kpi_tracker = {}
+    for kpi in tracking_kpis:
+        kpi_tracker[kpi.name] = kpi
+
+    for (kpi_name, kpi_value) in parse_log(log):
+        print(kpi_name, kpi_value)
+        kpi_tracker[kpi_name].add_record(kpi_value)
+        kpi_tracker[kpi_name].persist()
+
+
+if __name__ == '__main__':
+    log = sys.stdin.read()
+    print("*****")
+    print(log)
+    print("****")
+    log_to_ce(log)
diff --git a/fluid/text_classification/train.py b/fluid/text_classification/train.py
index dc164671e785b758365885b98788fae71d5f8a87..698e4dc0788f2e185810a4f782ac4dcff1f60c81 100644
--- a/fluid/text_classification/train.py
+++ b/fluid/text_classification/train.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import time
 import unittest
@@ -53,8 +54,12 @@ def train(train_reader,
     exe = fluid.Executor(place)
     feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
 
+    # For internal continuous evaluation
+    if 'CE_MODE_X' in os.environ:
+        fluid.default_startup_program().random_seed = 110
     exe.run(fluid.default_startup_program())
     for pass_id in xrange(pass_num):
+        pass_start = time.time()
         data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
         for data in train_reader():
             avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
@@ -73,6 +78,13 @@ def train(train_reader,
         epoch_model = save_dirname + "/" + "epoch" + str(pass_id)
         fluid.io.save_inference_model(epoch_model, ["words", "label"], acc, exe)
+        pass_end = time.time()
+        # For internal continuous evaluation
+        if 'CE_MODE_X' in os.environ:
+            print("kpis\ttrain_acc\t%f" % avg_acc)
+            print("kpis\ttrain_cost\t%f" % avg_cost)
+            print("kpis\ttrain_duration\t%f" % (pass_end - pass_start))
+
 
 def train_net():
     word_dict, train_reader, test_reader = utils.prepare_data(
diff --git a/fluid/text_classification/utils.py b/fluid/text_classification/utils.py
index fba14dde63d27ada07d8fcd69cacfb631559e613..874679c3e2f9fe0c640d6da4f25d503023adcb65 100644
--- a/fluid/text_classification/utils.py
+++ b/fluid/text_classification/utils.py
@@ -1,3 +1,4 @@
+import os
 import sys
 import time
 import numpy as np
@@ -64,15 +65,22 @@ def prepare_data(data_type="imdb",
         raise RuntimeError("No such dataset")
 
     if data_type == "imdb":
-        train_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.imdb.train(word_dict), buf_size=buf_size),
-            batch_size=batch_size)
+        if 'CE_MODE_X' in os.environ:
+            train_reader = paddle.batch(
+                paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
 
-        test_reader = paddle.batch(
-            paddle.reader.shuffle(
-                paddle.dataset.imdb.test(word_dict), buf_size=buf_size),
-            batch_size=batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.imdb.test(word_dict), batch_size=batch_size)
+        else:
+            train_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.imdb.train(word_dict), buf_size=buf_size),
+                batch_size=batch_size)
+
+            test_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.imdb.test(word_dict), buf_size=buf_size),
+                batch_size=batch_size)
     else:
         raise RuntimeError("no such dataset")