diff --git a/PaddleCV/3d_vision/PointNet++/ext_op/pointnet_lib.py b/PaddleCV/3d_vision/PointNet++/ext_op/pointnet_lib.py
index 5f607bf8775a6c0b20440f1635ae1c05b2ad8f07..cd4692d3ef7b9f4ef659ecf7e46764ae553f0dcf 100644
--- a/PaddleCV/3d_vision/PointNet++/ext_op/pointnet_lib.py
+++ b/PaddleCV/3d_vision/PointNet++/ext_op/pointnet_lib.py
@@ -53,8 +53,8 @@ def three_nn(input, known, eps=1e-10, name=None):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[16, 3], dtype='float32')
-            known = fluid.layers.data(name='known', shape=[32, 3], dtype='float32')
+            x = fluid.data(name='x', shape=[None, 16, 3], dtype='float32')
+            known = fluid.data(name='known', shape=[None, 32, 3], dtype='float32')
             distance, idx = fluid.layers.three_nn(input, known)
     """
     helper = LayerHelper('three_nn', **locals())
@@ -97,9 +97,9 @@ def three_interp(input, weight, idx, name=None):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[16, 3], dtype='float32')
-            weight = fluid.layers.data(name='weight', shape=[32, 3], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[32, 3], dtype='int32')
+            x = fluid.data(name='x', shape=[None, 16, 3], dtype='float32')
+            weight = fluid.data(name='weight', shape=[None, 32, 3], dtype='float32')
+            index = fluid.data(name='index', shape=[None, 32, 3], dtype='int32')
             out = fluid.layers.three_interp(x, weight, index)
     """
     helper = LayerHelper('three_interp', **locals())
@@ -132,8 +132,8 @@ def query_ball(input, new_points, radius, n_sample):
         .. code-block::python
 
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='points',shape=[-1,5,3],dtype='float32')
-            new_points = fluid.layers.data(name='new_points', shape=[-1,2,3], dtype='float32')
+            x = fluid.data(name='points',shape=[None,5,3],dtype='float32')
+            new_points = fluid.data(name='new_points', shape=[None,2,3], dtype='float32')
             output = fluid.layers.query_ball(x,new_points,radius=4.0,n_sample=5)
 
 
@@ -167,7 +167,7 @@ def farthest_point_sampling(input, sampled_point_num):
 
     Examples:
         .. code-block:: python
-        x = fluid.layers.data(name='data', shape=(2,100,3), dtype='float32')
+        x = fluid.data(name='data', shape=(None, 100, 3), dtype='float32')
         sampled_points = fluid.layers.farthest_point_sampling(
             x, 50
         )
@@ -210,8 +210,8 @@ def gather_point(input, index):
     Examples:
         .. code-block:: python
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[-1, 5, 3], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[-1, 1], dtype='int32')
+            x = fluid.data(name='x', shape=[None, 5, 3], dtype='float32')
+            index = fluid.data(name='index', shape=[None, 1], dtype='int32')
             output = fluid.layers.gather_point(x, index)
     """
 
@@ -249,8 +249,8 @@ def group_points(input, idx, name=None):
         .. code-block:: python
 
             import paddle.fluid as fluid
-            x = fluid.layers.data(name='x', shape=[16, 3], dtype='float32')
-            index = fluid.layers.data(name='index', shape=[32, 3], dtype='int32')
+            x = fluid.data(name='x', shape=[None, 16, 3], dtype='float32')
+            index = fluid.data(name='index', shape=[None, 32, 3], dtype='int32')
             out  = fluid.layers.group_points(x, index)
     """
     helper = LayerHelper('group_points', **locals())
diff --git a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_farthest_point_sampling_op.py b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_farthest_point_sampling_op.py
index 76df7c77f100070a75f253cfe9ac005663ecb3d7..5cae1ec35564b28131b34a849efe419707b93697 100644
--- a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_farthest_point_sampling_op.py
+++ b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_farthest_point_sampling_op.py
@@ -44,8 +44,8 @@ class TestFarthestPointSamplingOp(unittest.TestCase):
         x_type = 'float32'
         sampled_point_num = 256
 
-        x = fluid.layers.data(
-            name='x', shape=x_shape, dtype=x_type, append_batch_size=False)
+        x = fluid.data(
+            name='x', shape=x_shape, dtype=x_type)
         y = pointnet_lib.farthest_point_sampling(x, sampled_point_num)
 
         x_np = np.random.randint(1, 100, (x_shape[0] * x_shape[1] *
diff --git a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_gather_point_op.py b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_gather_point_op.py
index ff01bc8ad70e1a20ff854b270fa4d4fc2c2f08e1..cc97a9e935c7a5a61ba7ab2aa874ea00d324a548 100644
--- a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_gather_point_op.py
+++ b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_gather_point_op.py
@@ -35,10 +35,10 @@ class TestGatherPointOp(unittest.TestCase):
         idx_shape = (1, 32)
         idx_type = 'int32'
 
-        x = fluid.layers.data(
-            name='x', shape=x_shape, dtype=x_type, append_batch_size=False)
-        idx = fluid.layers.data(
-            name='idx', shape=idx_shape, dtype=idx_type, append_batch_size=False)
+        x = fluid.data(
+            name='x', shape=x_shape, dtype=x_type)
+        idx = fluid.data(
+            name='idx', shape=idx_shape, dtype=idx_type)
         y = pointnet_lib.gather_point(x, idx)
 
         x_np = np.random.uniform(-10, 10, x_shape).astype(x_type)
diff --git a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_group_points_op.py b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_group_points_op.py
index 8ab4fb7a9c5040bf2c8130d1bd211243038c046f..6af446bd03ea8f9c148d87062f55f1f9cbd0c5f3 100644
--- a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_group_points_op.py
+++ b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_group_points_op.py
@@ -39,10 +39,10 @@ class TestGroupPointsOp(unittest.TestCase):
         idx_shape = [8, 37, 41]
         idx_type = 'int32'
 
-        x = fluid.layers.data(
-            name='x', shape=x_shape, dtype=x_type, append_batch_size=False)
-        idx = fluid.layers.data(
-            name='idx', shape=idx_shape, dtype=idx_type, append_batch_size=False)
+        x = fluid.data(
+            name='x', shape=x_shape, dtype=x_type)
+        idx = fluid.data(
+            name='idx', shape=idx_shape, dtype=idx_type)
         y = pointnet_lib.group_points(x, idx)
 
         x_np = np.random.uniform(-10, 10, x_shape).astype(x_type)
diff --git a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_query_ball_op.py b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_query_ball_op.py
index ab3ea1821f388108edf753420e90d37c8abfcbc0..a20068478677f7c9e25fbd1da7b10c5ba792f39d 100644
--- a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_query_ball_op.py
+++ b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_query_ball_op.py
@@ -48,10 +48,10 @@ class TestQueryBallOp(unittest.TestCase):
         radius = 6
         nsample = 5
 
-        points = fluid.layers.data(
-            name='points', shape=points_shape, dtype=points_type, append_batch_size=False)
-        new_points = fluid.layers.data(
-            name='new_points', shape=new_points_shape, dtype=points_type, append_batch_size=False)
+        points = fluid.data(
+            name='points', shape=points_shape, dtype=points_type)
+        new_points = fluid.data(
+            name='new_points', shape=new_points_shape, dtype=points_type)
         y = pointnet_lib.query_ball(points, new_points, radius, nsample)
 
         points_np = np.random.randint(1, 5, points_shape).astype(points_type)
diff --git a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_interp_op.py b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_interp_op.py
index e73fbad756ac5e5d3703f8354e2b0641b7cc9383..0264838fd7677b4b31958a7bfd8e2e07d0349903 100644
--- a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_interp_op.py
+++ b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_interp_op.py
@@ -42,12 +42,12 @@ class TestThreeInterpOp(unittest.TestCase):
         weight_shape = [8, 37, 3]
         weight_type = 'float32'
 
-        x = fluid.layers.data(
-            name='x', shape=input_shape, dtype=input_type, append_batch_size=False)
-        weight = fluid.layers.data(
-            name='weight', shape=weight_shape, dtype=weight_type, append_batch_size=False)
-        idx = fluid.layers.data(
-            name='idx', shape=weight_shape, dtype="int32", append_batch_size=False)
+        x = fluid.data(
+            name='x', shape=input_shape, dtype=input_type)
+        weight = fluid.data(
+            name='weight', shape=weight_shape, dtype=weight_type)
+        idx = fluid.data(
+            name='idx', shape=weight_shape, dtype="int32")
         y = pointnet_lib.three_interp(x, weight, idx)
 
         x_np = np.random.random(input_shape).astype(input_type)
diff --git a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_nn_op.py b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_nn_op.py
index c6468e8b8cf881e3bbd1a16b1f8a6896fce07333..d37a3257a2fc0ad8567400332a142a8981d0d5e3 100644
--- a/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_nn_op.py
+++ b/PaddleCV/3d_vision/PointNet++/ext_op/tests/test_three_nn_op.py
@@ -57,10 +57,10 @@ class TestThreeNNOp(unittest.TestCase):
         input_type = 'float32'
         eps = 1e-10
 
-        x = fluid.layers.data(
-            name='x', shape=input_shape, dtype=input_type, append_batch_size=False)
-        known = fluid.layers.data(
-            name='known', shape=known_shape, dtype=input_type, append_batch_size=False)
+        x = fluid.data(
+            name='x', shape=input_shape, dtype=input_type)
+        known = fluid.data(
+            name='known', shape=known_shape, dtype=input_type)
         dist, idx = pointnet_lib.three_nn(x, known, eps)
 
         x_np = np.random.random(input_shape).astype(input_type)
diff --git a/PaddleCV/3d_vision/PointRCNN/utils/proposal_utils.py b/PaddleCV/3d_vision/PointRCNN/utils/proposal_utils.py
index 9160ffe8e4e4a1aff7f8e8984e5ddd3711d1ffb0..16a3c2df4d1963bc6dd3e86b105df76067e98f13 100644
--- a/PaddleCV/3d_vision/PointRCNN/utils/proposal_utils.py
+++ b/PaddleCV/3d_vision/PointRCNN/utils/proposal_utils.py
@@ -257,7 +257,7 @@ if __name__ == "__main__":
     # cfg.RPN.NMS_TYPE = 'rotate'
     proposal_func = get_proposal_func(cfg)
 
-    x = fluid.layers.data(name="x", shape=[256, 84], dtype='float32')
+    x = fluid.data(name="x", shape=[None, 256, 84], dtype='float32')
     proposal = fluid.default_main_program().current_block().create_var(
                     name="proposal", dtype='float32', shape=[256, 7])
     fluid.layers.py_func(proposal_func, x, proposal)
diff --git a/PaddleCV/gan/c_gan/c_gan.py b/PaddleCV/gan/c_gan/c_gan.py
index 53a79e3b2d05991bc44a88272ecbd5c3a2b20c43..4ab1d59a9b48656716eb635af5552da6b4661225 100644
--- a/PaddleCV/gan/c_gan/c_gan.py
+++ b/PaddleCV/gan/c_gan/c_gan.py
@@ -61,18 +61,18 @@ def train(args):
     dg_program = fluid.Program()
 
     with fluid.program_guard(d_program):
-        conditions = fluid.layers.data(
-            name='conditions', shape=[1], dtype='float32')
-        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='float32')
+        conditions = fluid.data(
+            name='conditions', shape=[None, 1], dtype='float32')
+        img = fluid.data(name='img', shape=[None, 784], dtype='float32')
+        label = fluid.data(name='label', shape=[None, 1], dtype='float32')
         d_logit = D_cond(img, conditions)
         d_loss = loss(d_logit, label)
 
     with fluid.program_guard(dg_program):
-        conditions = fluid.layers.data(
-            name='conditions', shape=[1], dtype='float32')
-        noise = fluid.layers.data(
-            name='noise', shape=[NOISE_SIZE], dtype='float32')
+        conditions = fluid.data(
+            name='conditions', shape=[None, 1], dtype='float32')
+        noise = fluid.data(
+            name='noise', shape=[None, NOISE_SIZE], dtype='float32')
         g_img = G_cond(z=noise, y=conditions)
 
         g_program = dg_program.clone()
diff --git a/PaddleCV/gan/c_gan/dc_gan.py b/PaddleCV/gan/c_gan/dc_gan.py
index 9ad42d4fe0b0a49c23af0d658962e2dadaa30aab..11520d48f2cd0b945e74bce61061724315d5647f 100644
--- a/PaddleCV/gan/c_gan/dc_gan.py
+++ b/PaddleCV/gan/c_gan/dc_gan.py
@@ -60,14 +60,14 @@ def train(args):
     dg_program = fluid.Program()
 
     with fluid.program_guard(d_program):
-        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='float32')
+        img = fluid.data(name='img', shape=[None, 784], dtype='float32')
+        label = fluid.data(name='label', shape=[None, 1], dtype='float32')
         d_logit = D(img)
         d_loss = loss(d_logit, label)
 
     with fluid.program_guard(dg_program):
-        noise = fluid.layers.data(
-            name='noise', shape=[NOISE_SIZE], dtype='float32')
+        noise = fluid.data(
+            name='noise', shape=[None, NOISE_SIZE], dtype='float32')
         g_img = G(x=noise)
 
         g_program = dg_program.clone()
diff --git a/PaddleCV/gan/network/base_network.py b/PaddleCV/gan/network/base_network.py
index 2aba0eb737190c5f968ad429da3a58442a0106aa..f856401e5594f45b69e4b85054ab27dd8d85cc83 100644
--- a/PaddleCV/gan/network/base_network.py
+++ b/PaddleCV/gan/network/base_network.py
@@ -508,9 +508,9 @@ def conv2d_with_filter(input,
                     groups mismatch.
     Examples:
         .. code-block:: python
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], \
+          data = fluid.data(name='data', shape=[None, 3, 32, 32], \
                                   dtype='float32')
-          filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
-                                    dtype='float32',append_batch_size=False)
+          filter = fluid.data(name='filter',shape=[10,3,3,3], \
+                                    dtype='float32')
           conv2d = fluid.layers.conv2d(input=data, 
                                        filter=filter,
diff --git a/PaddleCV/metric_learning/eval.py b/PaddleCV/metric_learning/eval.py
index a11e6bc19c5bae54d8a761b70e3bce7109c1fa6a..dd4aef51cff4035c51b05124156d3f1e2b74dc74 100644
--- a/PaddleCV/metric_learning/eval.py
+++ b/PaddleCV/metric_learning/eval.py
@@ -52,8 +52,9 @@ def eval(args):
     assert model_name in model_list, "{} is not in lists: {}".format(args.model,
                                                                      model_list)
 
-    image = fluid.layers.data(name='image', shape=[None] + image_shape, dtype='float32')
-    label = fluid.layers.data(name='label', shape=[None, 1], dtype='int64')
+    image = fluid.data(name='image', shape=[None] + image_shape, dtype='float32')
+    label = fluid.data(name='label', shape=[None, 1], dtype='int64')
+
     test_loader = fluid.io.DataLoader.from_generator(
                 feed_list=[image, label],
                 capacity=64,
@@ -75,7 +76,7 @@ def eval(args):
         def if_exist(var):
             return os.path.exists(os.path.join(pretrained_model, var.name))
 
-        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+        fluid.load(program=test_program, model_path=pretrained_model, executor=exe)
 
     test_loader.set_sample_generator(
         reader.test(args),
diff --git a/PaddleCV/metric_learning/infer.py b/PaddleCV/metric_learning/infer.py
index 4710cfc89ff4b02fab1c31d2c7d7e13c0549cb2a..fa419435d2f80db182df866b13aceaf7147963e6 100644
--- a/PaddleCV/metric_learning/infer.py
+++ b/PaddleCV/metric_learning/infer.py
@@ -51,7 +51,7 @@ def infer(args):
     assert model_name in model_list, "{} is not in lists: {}".format(args.model,
                                                                      model_list)
 
-    image = fluid.layers.data(name='image', shape=[None] + image_shape, dtype='float32')
+    image = fluid.data(name='image', shape=[None] + image_shape, dtype='float32')
 
     infer_loader = fluid.io.DataLoader.from_generator(
                 feed_list=[image],
@@ -74,7 +74,7 @@ def infer(args):
         def if_exist(var):
             return os.path.exists(os.path.join(pretrained_model, var.name))
 
-        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
+        fluid.load(model_path=pretrained_model, program=test_program, executor=exe)
 
     infer_loader.set_sample_generator(
         reader.test(args),
diff --git a/PaddleCV/metric_learning/train_elem.py b/PaddleCV/metric_learning/train_elem.py
index a06a349012e049b3b7eceb1f1aabd32bcaca8224..6d13be5283640413c6c140b9be864e428b5033d6 100644
--- a/PaddleCV/metric_learning/train_elem.py
+++ b/PaddleCV/metric_learning/train_elem.py
@@ -108,9 +108,9 @@ def build_program(is_train, main_prog, startup_prog, args):
     model = models.__dict__[args.model]()
     with fluid.program_guard(main_prog, startup_prog):
         queue_capacity = 64
-        image = fluid.layers.data(
+        image = fluid.data(
                 name='image', shape=[None] + image_shape, dtype='float32')
-        label = fluid.layers.data(
+        label = fluid.data(
                 name='label', shape=[None, 1], dtype='int64')
         loader = fluid.io.DataLoader.from_generator(
                 feed_list=[image, label],
@@ -190,15 +190,14 @@ def train_async(args):
     logging.debug('after run startup program')
 
     if checkpoint is not None:
-        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
+        fluid.load(program=train_prog, model_path=checkpoint, executor=exe)
 
     if pretrained_model:
 
         def if_exist(var):
             return os.path.exists(os.path.join(pretrained_model, var.name))
 
-        fluid.io.load_vars(
-            exe, pretrained_model, main_program=train_prog, predicate=if_exist)
+        fluid.load(program=train_prog, model_path=pretrained_model, executor=exe)
 
     if args.use_gpu:
         devicenum = get_gpu_num()
@@ -287,7 +286,7 @@ def train_async(args):
                                           str(iter_no))
                 if not os.path.isdir(model_path):
                     os.makedirs(model_path)
-                fluid.io.save_persistables(exe, model_path, main_program=train_prog)
+                fluid.save(program=train_prog, model_path=model_path)
 
             iter_no += 1
 
diff --git a/PaddleCV/metric_learning/train_pair.py b/PaddleCV/metric_learning/train_pair.py
index 9bc2b5becde3d540b577f3170df273756e9e2293..aa31f8944cfb79ef91c3d9d48bc0c8b6c7567d58 100644
--- a/PaddleCV/metric_learning/train_pair.py
+++ b/PaddleCV/metric_learning/train_pair.py
@@ -115,9 +115,9 @@ def build_program(is_train, main_prog, startup_prog, args):
     model = models.__dict__[args.model]()
     with fluid.program_guard(main_prog, startup_prog):
         queue_capacity = 64
-        image = fluid.layers.data(
+        image = fluid.data(
                 name='image', shape=[None] + image_shape, dtype='float32')
-        label = fluid.layers.data(
+        label = fluid.data(
                 name='label', shape=[None, 1], dtype='int64')
         loader = fluid.io.DataLoader.from_generator(
                 feed_list=[image, label],
@@ -188,15 +188,15 @@ def train_async(args):
     logging.debug('after run startup program')
 
     if checkpoint is not None:
-        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
+        fluid.load(program=train_prog, model_path=checkpoint, executor=exe)
 
     if pretrained_model:
 
         def if_exist(var):
             return os.path.exists(os.path.join(pretrained_model, var.name))
 
-        fluid.io.load_vars(
-            exe, pretrained_model, main_program=train_prog, predicate=if_exist)
+        fluid.load(program=train_prog, model_path=pretrained_model, executor=exe)
+
 
     if args.use_gpu:
         devicenum = get_gpu_num()
@@ -283,7 +283,7 @@ def train_async(args):
                                       str(iter_no))
                 if not os.path.isdir(model_path):
                     os.makedirs(model_path)
-                fluid.io.save_persistables(exe, model_path, main_program=train_prog)
+                fluid.save(program=train_prog, model_path=model_path)
 
             iter_no += 1
 
diff --git a/PaddleCV/ocr_recognition/README.md b/PaddleCV/ocr_recognition/README.md
index 70c5fae52c2a823b4e977bccdaf382cc75fcf0f5..7bb6519072dc228feb55ea422a4e51054c7cfe6b 100644
--- a/PaddleCV/ocr_recognition/README.md
+++ b/PaddleCV/ocr_recognition/README.md
@@ -1,4 +1,8 @@
 
+## 注意
+注意：该OCR库已迁移至新的github地址：https://github.com/PaddlePaddle/PaddleOCR
+该新库包含总模型仅8.6M的超轻量级中文OCR，单模型支持中英文数字组合识别、竖排文本识别、长文本识别。同时支持多种文本检测、文本识别的训练算法。欢迎大家去新的代码仓库中，查看与阅读更多关于OCR的详细介绍以及新功能。
+
 ## 代码结构
 ```
 ├── data_reader.py  # 下载、读取、处理数据。
diff --git a/PaddleCV/tracking/pytracking/libs/Fconv2d.py b/PaddleCV/tracking/pytracking/libs/Fconv2d.py
index 6d926b53934ac4b5b91f1d3859f00afffde8d7a2..22552e25589ce75a137c3a08f7f58d05d3c482cf 100644
--- a/PaddleCV/tracking/pytracking/libs/Fconv2d.py
+++ b/PaddleCV/tracking/pytracking/libs/Fconv2d.py
@@ -72,10 +72,10 @@ def Fconv2d(
                     groups mismatch.
     Examples:
         .. code-block:: python
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], \
+          data = fluid.data(name='data', shape=[None, 3, 32, 32], \
                                   dtype='float32')
-          filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
-                                    dtype='float32',append_batch_size=False)
+          filter = fluid.data(name='filter',shape=[10,3,3,3], \
+                                    dtype='float32')
           conv2d = fluid.layers.conv2d(input=data,
                                        filter=filter,
                                        act="relu")
diff --git a/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py b/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py
index d62edb7083d67559c7c6c0304be976a163a00117..ba6368739a4ac3ca163bdcabbab7d13fd5808b06 100644
--- a/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py
+++ b/PaddleCV/tracking/pytracking/libs/Fconv2d_static.py
@@ -60,9 +60,9 @@ def Fconv2d(input,
                     groups mismatch.
     Examples:
         .. code-block:: python
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], \
+          data = fluid.data(name='data', shape=[None, 3, 32, 32], \
                                   dtype='float32')
-          filter = fluid.layers.data(name='filter',shape=[10,3,3,3], \
-                                    dtype='float32',append_batch_size=False)
+          filter = fluid.data(name='filter',shape=[10,3,3,3], \
+                                    dtype='float32')
           conv2d = fluid.layers.conv2d(input=data,
                                        filter=filter,
@@ -112,62 +112,4 @@ def Fconv2d(input,
     return pre_bias
 
 
-def test_conv2d_with_filter():
-    exemplar = np.random.random((8, 4, 6, 6)).astype(np.float32)
-    instance = np.random.random((8, 4, 22, 22)).astype(np.float32)
 
-    # fluid.layers.data(append_batch_size=)
-    use_gpu = False
-    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
-
-    train_program = fluid.Program()
-    start_program = fluid.Program()
-
-    with fluid.program_guard(train_program, start_program):
-        x = fluid.layers.data(
-            name="inst", shape=[8, 4, 22, 22], append_batch_size=False)
-        y = fluid.layers.data(
-            name="exem", shape=[8, 4, 6, 6], append_batch_size=False)
-        bias_att = fluid.ParamAttr(
-            name="bias_", initializer=fluid.initializer.ConstantInitializer(1.))
-        out = conv2d_with_filter(x, y, groups=1)
-        weight_att = fluid.ParamAttr(
-            name='weight',
-            initializer=fluid.initializer.NumpyArrayInitializer(exemplar))
-        bias_att = fluid.ParamAttr(
-            name="bias", initializer=fluid.initializer.ConstantInitializer(0.))
-        res = fluid.layers.conv2d(
-            x,
-            8,
-            6,
-            param_attr=weight_att,
-            bias_attr=bias_att,
-            stride=1,
-            padding=0,
-            dilation=1)
-
-        exe = fluid.Executor(place)
-        exe.run(program=fluid.default_startup_program())
-    print(out.shape)
-
-    compiled_prog = fluid.compiler.CompiledProgram(train_program)
-    out, res = exe.run(compiled_prog,
-                       feed={"inst": instance,
-                             "exem": exemplar},
-                       fetch_list=[out.name, res.name])
-
-    print(np.sum(out - res))
-    np.testing.assert_allclose(out, res, rtol=1e-5, atol=0)
-
-    with fluid.dygraph.guard():
-        exem = fluid.dygraph.to_variable(exemplar)
-        inst = fluid.dygraph.to_variable(instance)
-
-        out = conv2d_with_filter(inst, exem, groups=1)
-
-    print(np.sum(out.numpy() - res))
-    np.testing.assert_allclose(out.numpy(), res, rtol=1e-5, atol=0)
-
-
-if __name__ == '__main__':
-    test_conv2d_with_filter()
diff --git a/PaddleCV/tracking/pytracking/libs/optimization.py b/PaddleCV/tracking/pytracking/libs/optimization.py
index 41236ad8c9f26451903020da77a5b3910345cd62..3965fcfa2725338c8724f948221b098f9061ca69 100644
--- a/PaddleCV/tracking/pytracking/libs/optimization.py
+++ b/PaddleCV/tracking/pytracking/libs/optimization.py
@@ -3,7 +3,7 @@ from paddle.fluid import layers
 from paddle import fluid
 from pytracking.libs.tensorlist import TensorList
 from pytracking.utils.plotting import plot_graph
-from pytracking.libs.paddle_utils import n2p, clone, static_clone
+from pytracking.libs.paddle_utils import n2p, clone, static_clone, create_var_list
 
 
 class L2Problem:
@@ -243,20 +243,9 @@ class ConjugateGradient(ConjugateGradientBase):
         start_program = fluid.Program()
         with fluid.program_guard(train_program, start_program):
             scope = 'first/'
-            self.x_ph = TensorList([
-                fluid.layers.data(
-                    '{}x_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.x)
-            ])
-            self.p_ph = TensorList([
-                fluid.layers.data(
-                    '{}p_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.x)
-            ])
+            self.x_ph = TensorList(create_var_list(scope+"x", self.x, None))
+
+            self.p_ph = TensorList(create_var_list(scope+"p", self.x, None))
 
             # problem forward
             self.f0 = self.problem(self.x_ph, scope)
@@ -277,20 +266,10 @@ class ConjugateGradient(ConjugateGradientBase):
         start_program2 = fluid.Program()
         with fluid.program_guard(train_program2, start_program2):
             scope = 'second/'
-            self.x_ph_2 = TensorList([
-                fluid.layers.data(
-                    '{}x_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.x)
-            ])
-            self.dfdx_x_ph = TensorList([
-                fluid.layers.data(
-                    '{}dfdx_x_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.g)
-            ])
+
+            self.x_ph_2 = TensorList(create_var_list(scope+"x", self.x, None))
+
+            self.dfdx_x_ph = TensorList(create_var_list(scope+"dfdx_x", self.g, None))
 
             self.f0_2 = self.problem(self.x_ph_2, scope)
             self.dfdx_dfdx = TensorList(
@@ -444,20 +423,9 @@ class GaussNewtonCG(ConjugateGradientBase):
         start_program = fluid.Program()
         with fluid.program_guard(train_program, start_program):
             scope = 'first/'
-            self.x_ph = TensorList([
-                fluid.layers.data(
-                    '{}x_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.x)
-            ])
-            self.p_ph = TensorList([
-                fluid.layers.data(
-                    '{}p_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.x)
-            ])
+            self.x_ph = TensorList(create_var_list(scope+"x", self.x, None))
+
+            self.p_ph = TensorList(create_var_list(scope+"p", self.x, None))
 
             # problem forward
             self.f0 = self.problem(self.x_ph, scope)
@@ -477,20 +445,9 @@ class GaussNewtonCG(ConjugateGradientBase):
         start_program2 = fluid.Program()
         with fluid.program_guard(train_program2, start_program2):
             scope = 'second/'
-            self.x_ph_2 = TensorList([
-                fluid.layers.data(
-                    '{}x_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.x)
-            ])
-            self.dfdx_x_ph = TensorList([
-                fluid.layers.data(
-                    '{}dfdx_x_{}'.format(scope, idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.g)
-            ])
+            self.x_ph_2 = TensorList(create_var_list(scope+"x", self.x, None))
+
+            self.dfdx_x_ph = TensorList(create_var_list(scope+"dfdx_x", self.g, None))
 
             self.f0_2 = self.problem(self.x_ph_2, scope)
             self.dfdx_dfdx = TensorList(
@@ -654,13 +611,7 @@ class GradientDescentL2:
         train_program = fluid.Program()
         start_program = fluid.Program()
         with fluid.program_guard(train_program, start_program):
-            self.x_ph = TensorList([
-                fluid.layers.data(
-                    'x_{}'.format(idx),
-                    v.shape,
-                    append_batch_size=False,
-                    stop_gradient=False) for idx, v in enumerate(self.x)
-            ])
+            self.x_ph = TensorList(create_var_list("x", self.x, None))
 
             # problem forward
             self.f0 = self.problem(self.x_ph)
diff --git a/PaddleCV/tracking/pytracking/libs/paddle_utils.py b/PaddleCV/tracking/pytracking/libs/paddle_utils.py
index 79cf8b7d8ffc6f3975dee8378328891d10eca5ed..f23a71a8050fe4915264a6e7b88b6036d010a7a6 100644
--- a/PaddleCV/tracking/pytracking/libs/paddle_utils.py
+++ b/PaddleCV/tracking/pytracking/libs/paddle_utils.py
@@ -1,5 +1,6 @@
 import numpy as np
 import paddle
+import paddle.fluid as fluid
 from paddle.fluid import dygraph
 from paddle.fluid import layers
 from paddle.fluid.framework import Variable
@@ -216,3 +217,35 @@ def dropout2d(input, prob, is_train=False):
     binary_tensor = layers.floor(random_tensor)
     output = input / keep_prob * binary_tensor
     return output
+
+
+def create_var_list(scope, var_lists, shape):
+    """Create one `fluid.data` placeholder for every entry of `var_lists`.
+
+    Each placeholder is named "<scope>_<idx>", is created without an
+    explicit dtype, and has `stop_gradient` set to False.
+
+    Args:
+        scope: name prefix shared by all created placeholders.
+        var_lists: iterable of array-like items that expose `.shape`.
+        shape: None to declare each placeholder with the item's own
+            `.shape`; otherwise a list/tuple of leading dimensions that is
+            prepended to the shape of the item's first element
+            (`v[0].shape`).
+
+    Returns:
+        list of the created variables, in the order of `var_lists`.
+    """
+    placeholders = []
+    for idx, v in enumerate(var_lists):
+        name = "{}_{}".format(scope, idx)
+        if shape is None:
+            var_shape = v.shape
+        else:
+            # list(shape) keeps tuple prefixes working and never aliases
+            # the caller's list.
+            var_shape = list(shape) + list(v[0].shape)
+        var = fluid.data(name, shape=var_shape)
+        var.stop_gradient = False
+        placeholders.append(var)
+    return placeholders
diff --git a/PaddleCV/tracking/pytracking/tracker/atom/optim.py b/PaddleCV/tracking/pytracking/tracker/atom/optim.py
index 8c25c43d6123187baf9ffbffef5b75f2e016a4dc..ae0c4f29659b581d53fe5cdfe14c723df8108158 100644
--- a/PaddleCV/tracking/pytracking/tracker/atom/optim.py
+++ b/PaddleCV/tracking/pytracking/tracker/atom/optim.py
@@ -5,6 +5,7 @@ from paddle import fluid
 from pytracking.libs import optimization, TensorList, operation
 from pytracking.libs.paddle_utils import PTensor, broadcast_op, n2p, static_identity
 import math
+from pytracking.libs.paddle_utils import create_var_list
 
 
 def stack_input(e):
@@ -50,29 +51,18 @@ class FactorizedConvProblem(optimization.L2Problem):
 
     def get_inputs(self, scope=''):
         if scope not in self.inputs_dict:
-            training_samples_p = TensorList([
-                fluid.layers.data(
-                    '{}training_samples_{}'.format(scope, idx),
-                    shape=[None] + list(v[0].shape),
-                    stop_gradient=False,
-                    append_batch_size=False)
-                for idx, v in enumerate(self.training_samples)
-            ])
-            y_p = TensorList([
-                fluid.layers.data(
-                    '{}y_{}'.format(scope, idx),
-                    shape=[None] + list(v[0].shape),
-                    stop_gradient=False,
-                    append_batch_size=False) for idx, v in enumerate(self.y)
-            ])
-            sample_weights_p = TensorList([
-                fluid.layers.data(
-                    '{}sample_weights_{}'.format(scope, idx),
-                    shape=[None, 1],
-                    stop_gradient=False,
-                    append_batch_size=False)
-                for idx, v in enumerate(self.sample_weights)
-            ])
+            name = scope + "training_samples"
+            vars = create_var_list(name, self.training_samples, [None])
+            training_samples_p = TensorList(vars)
+            # Targets share the samples' layout: [None] + list(v[0].shape).
+            name = scope + "y"
+            vars = create_var_list(name, self.y, [None])
+            y_p = TensorList(vars)
+            # Weights: create_var_list prepends [None, 1] to v[0].shape.
+            name = scope + "sample_weights"
+            vars = create_var_list(name, self.sample_weights, [None, 1])
+            sample_weights_p = TensorList(vars)
+
             self.inputs_dict[scope] = (training_samples_p, y_p,
                                        sample_weights_p)
 
@@ -189,29 +179,18 @@ class ConvProblem(optimization.L2Problem):
 
     def get_inputs(self, scope=''):
         if scope not in self.inputs_dict:
-            training_samples_p = TensorList([
-                fluid.layers.data(
-                    '{}training_samples_{}'.format(scope, idx),
-                    shape=[None] + list(v[0].shape),
-                    stop_gradient=False,
-                    append_batch_size=False)
-                for idx, v in enumerate(self.training_samples)
-            ])
-            y_p = TensorList([
-                fluid.layers.data(
-                    '{}y_{}'.format(scope, idx),
-                    shape=[None] + list(v[0].shape),
-                    stop_gradient=False,
-                    append_batch_size=False) for idx, v in enumerate(self.y)
-            ])
-            sample_weights_p = TensorList([
-                fluid.layers.data(
-                    '{}sample_weights_{}'.format(scope, idx),
-                    shape=[None] + list(v[0].shape),
-                    stop_gradient=False,
-                    append_batch_size=False)
-                for idx, v in enumerate(self.sample_weights)
-            ])
+            name = scope + "training_samples"
+            vars = create_var_list(name, self.training_samples, [None])
+            training_samples_p = TensorList(vars)
+
+            name = scope + "y"
+            vars = create_var_list(name, self.y, [None])
+            y_p = TensorList(vars)
+
+            name = scope + "sample_weights"
+            vars = create_var_list(name, self.sample_weights, [None])
+            sample_weights_p = TensorList(vars)
+
             self.inputs_dict[scope] = (training_samples_p, y_p,
                                        sample_weights_p)
 
diff --git a/PaddleCV/video/README.md b/PaddleCV/video/README.md
index e7d1785241d5f93d5302b074b99ec2e65e227e26..abeadebb3ad71797f53c1ccaffc38592d114e362 100644
--- a/PaddleCV/video/README.md
+++ b/PaddleCV/video/README.md
@@ -25,6 +25,13 @@
 
 - 提供了适合视频分类和动作定位任务的通用骨架代码，用户可一键式高效配置模型完成训练和评测。
 
+### 推荐用法
+
+- 视频分类共开源7个模型，可分为：端到端模型、序列模型。端到端模型：TSN推荐在时序不敏感视频场景（比如互联网视频场景）使用；TSM、StNet推荐在时序敏感视频场景（比如Kinetics数据集）使用；Non-local模型计算量较大，在科研场景推荐。序列模型：Attention LSTM，Attention Cluster和NeXtVLAD 整体性能接近，但是网络结构不同，推荐集成多个模型使用。
+
+- 视频动作定位共开源3个模型，视频动作定位推荐使用CTCN模型，时序提名生成推荐使用BMN模型。
+
+
 ## 安装
 
 在当前模型库运行样例代码需要PaddlePaddle Fluid v.1.6.0或以上的版本。如果你的运行环境中的PaddlePaddle低于此版本，请根据[安装文档](http://www.paddlepaddle.org/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html)中的说明来更新PaddlePaddle。
diff --git a/PaddleCV/video/metrics/youtube8m/eval_util.py b/PaddleCV/video/metrics/youtube8m/eval_util.py
index f7742236f1176073eae84fdc7c3a3a1a2e294fe0..67bc2e54d14df350ab659ebaedeec9dd9dd176bc 100644
--- a/PaddleCV/video/metrics/youtube8m/eval_util.py
+++ b/PaddleCV/video/metrics/youtube8m/eval_util.py
@@ -37,7 +37,7 @@ def calculate_hit_at_one(predictions, actuals):
     float: The average hit at one across the entire batch.
   """
     top_prediction = numpy.argmax(predictions, 1)
-    hits = actuals[numpy.arange(actuals.shape[0]), top_prediction]
+    hits = actuals[numpy.arange(len(actuals)), top_prediction]  # one per row
     return numpy.average(hits)
 
 
diff --git a/PaddleCV/video/models/attention_cluster/shifting_attention.py b/PaddleCV/video/models/attention_cluster/shifting_attention.py
index e27ad8dd58b882eb96fbb9763eecccc36ddfe28a..e39dddee8e663c35386c017c3937df4da2e2b136 100644
--- a/PaddleCV/video/models/attention_cluster/shifting_attention.py
+++ b/PaddleCV/video/models/attention_cluster/shifting_attention.py
@@ -32,8 +32,7 @@ class ShiftingAttentionModel(object):
         x_shape.stop_gradient = True
         flat_x = fluid.layers.reshape(x, shape=(-1, self.seg_num))
         flat_softmax = fluid.layers.softmax(flat_x)
-        return fluid.layers.reshape(
-            flat_softmax, shape=x.shape, actual_shape=x_shape)
+        return fluid.layers.reshape(flat_softmax, shape=x_shape)
 
     def glorot(self, n):
         return np.sqrt(1.0 / np.sqrt(n))
diff --git a/PaddleCV/video/reader/bmn_reader.py b/PaddleCV/video/reader/bmn_reader.py
index 7a3bc944253d54f473d5efec43ae5a0771392741..eb5c0b2ef91b67e4e6c438ee08e80f2c3ca95805 100644
--- a/PaddleCV/video/reader/bmn_reader.py
+++ b/PaddleCV/video/reader/bmn_reader.py
@@ -21,6 +21,7 @@ import json
 import logging
 import functools
 import paddle
+import paddle.fluid as fluid
 
 logger = logging.getLogger(__name__)
 
@@ -228,8 +229,8 @@ class BMNReader(DataReader):
         mapper = functools.partial(process_data, mode=self.mode)
 
         def batch_reader():
-            xreader = paddle.reader.xmap_readers(mapper, reader,
-                                                 self.num_threads, 1024)
+            xreader = fluid.io.xmap_readers(mapper, reader, self.num_threads,
+                                            1024)
             batch = []
             for item in xreader():
                 batch.append(item)
diff --git a/PaddleCV/video/reader/bsn_reader.py b/PaddleCV/video/reader/bsn_reader.py
index a94074b53473cc4b801cc4772133ff27bf6f9b15..ae4c6b72c01d17d1836f33248598fb110e26113e 100644
--- a/PaddleCV/video/reader/bsn_reader.py
+++ b/PaddleCV/video/reader/bsn_reader.py
@@ -22,6 +22,7 @@ import json
 import logging
 import functools
 import paddle
+import paddle.fluid as fluid
 logger = logging.getLogger(__name__)
 
 from .reader_utils import DataReader
@@ -214,8 +215,8 @@ class BSNVideoReader(DataReader):
         mapper = functools.partial(process_data, mode=self.mode)
 
         def batch_reader():
-            xreader = paddle.reader.xmap_readers(mapper, reader,
-                                                 self.num_threads, 1024)
+            xreader = fluid.io.xmap_readers(mapper, reader, self.num_threads,
+                                            1024)
             batch = []
             for item in xreader():
                 batch.append(item)
@@ -444,8 +445,8 @@ class BSNProposalReader(DataReader):
         mapper = functools.partial(process_data, mode=self.mode)
 
         def batch_reader():
-            xreader = paddle.reader.xmap_readers(mapper, reader,
-                                                 self.num_threads, 1024)
+            xreader = fluid.io.xmap_readers(mapper, reader, self.num_threads,
+                                            1024)
             batch = []
             for item in xreader():
                 batch.append(item)
diff --git a/PaddleCV/video/reader/ets_reader.py b/PaddleCV/video/reader/ets_reader.py
index 745887eedf734308d5397722774f8cdcd711064b..a53250116ea344eaca9ef1333f05f1394a839a46 100644
--- a/PaddleCV/video/reader/ets_reader.py
+++ b/PaddleCV/video/reader/ets_reader.py
@@ -18,6 +18,7 @@ import sys
 import numpy as np
 import functools
 import paddle
+import paddle.fluid as fluid
 
 import logging
 logger = logging.getLogger(__name__)
@@ -154,8 +155,8 @@ class ETSReader(DataReader):
 
             mapper = functools.partial(process_data)
 
-            return paddle.reader.xmap_readers(mapper, reader, self.num_threads,
-                                              self.buffer_size)
+            return fluid.io.xmap_readers(mapper, reader, self.num_threads,
+                                         self.buffer_size)
 
         def batch_reader():
             batch_out = []
diff --git a/PaddleCV/video/reader/kinetics_reader.py b/PaddleCV/video/reader/kinetics_reader.py
index 2960096948730deda6ba986081d5b74b692904d2..55a70a1d812e5be4641de184899df95d8820ec0f 100644
--- a/PaddleCV/video/reader/kinetics_reader.py
+++ b/PaddleCV/video/reader/kinetics_reader.py
@@ -26,7 +26,7 @@ except ImportError:
     from io import BytesIO
 import numpy as np
 import paddle
-
+import paddle.fluid as fluid
 try:
     from nvidia.dali.pipeline import Pipeline
     import nvidia.dali.ops as ops
@@ -34,6 +34,7 @@ try:
     import tempfile
     from nvidia.dali.plugin.paddle import DALIGenericIterator
 except:
+    Pipeline = object
     print("DALI is not installed, you can improve performance if use DALI")
 
 from PIL import Image, ImageEnhance
@@ -272,8 +273,7 @@ class KineticsReader(DataReader):
             img_mean=img_mean,
             img_std=img_std)
 
-        return paddle.reader.xmap_readers(mapper, reader_, num_threads,
-                                          buf_size)
+        return fluid.io.xmap_readers(mapper, reader_, num_threads, buf_size)
 
     def build_dali_reader(self):
         """
diff --git a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade/utils/save_load_io.py b/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade/utils/save_load_io.py
deleted file mode 100644
index bdcdd811dd1ebb27542e5facc3ba050f00df08f8..0000000000000000000000000000000000000000
--- a/PaddleNLP/dialogue_system/auto_dialogue_evaluation/ade/utils/save_load_io.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""save or load model api"""
-
-import os
-import sys
-
-import paddle
-import paddle.fluid as fluid
-
-
-def init_from_pretrain_model(args, exe, program):
-
-    assert isinstance(args.init_from_pretrain_model, str)
-
-    if not os.path.exists(args.init_from_pretrain_model):
-        raise Warning("The pretrained params do not exist.")
-        return False
-
-    def existed_params(var):
-        if not isinstance(var, fluid.framework.Parameter):
-            return False
-        return os.path.exists(
-            os.path.join(args.init_from_pretrain_model, var.name))
-
-    fluid.io.load_vars(
-        exe,
-        args.init_from_pretrain_model,
-        main_program=program,
-        predicate=existed_params)
-
-    print("finish initing model from pretrained params from %s" %
-          (args.init_from_pretrain_model))
-
-    return True
-
-
-def init_from_checkpoint(args, exe, program):
-
-    assert isinstance(args.init_from_checkpoint, str)
-
-    if not os.path.exists(args.init_from_checkpoint):
-        raise Warning("the checkpoint path does not exist.")
-        return False
-
-    fluid.io.load_persistables(
-        executor=exe,
-        dirname=args.init_from_checkpoint,
-        main_program=program,
-        filename="checkpoint.pdckpt")
-
-    print("finish initing model from checkpoint from %s" %
-          (args.init_from_checkpoint))
-
-    return True
-
-
-def init_from_params(args, exe, program):
-
-    assert isinstance(args.init_from_params, str)
-
-    if not os.path.exists(args.init_from_params):
-        raise Warning("the params path does not exist.")
-        return False
-
-    fluid.io.load_params(
-        executor=exe,
-        dirname=args.init_from_params,
-        main_program=program,
-        filename="params.pdparams")
-
-    print("finish init model from params from %s" % (args.init_from_params))
-
-    return True
-
-
-def save_checkpoint(args, exe, program, dirname):
-
-    assert isinstance(args.save_model_path, str)
-
-    checkpoint_dir = os.path.join(args.save_model_path, args.save_checkpoint)
-
-    if not os.path.exists(checkpoint_dir):
-        os.mkdir(checkpoint_dir)
-
-    fluid.io.save_persistables(
-        exe,
-        os.path.join(checkpoint_dir, dirname),
-        main_program=program,
-        filename="checkpoint.pdckpt")
-
-    print("save checkpoint at %s" % (os.path.join(checkpoint_dir, dirname)))
-
-    return True
-
-
-def save_param(args, exe, program, dirname):
-
-    assert isinstance(args.save_model_path, str)
-
-    param_dir = os.path.join(args.save_model_path, args.save_param)
-
-    if not os.path.exists(param_dir):
-        os.makedirs(param_dir)
-
-    fluid.io.save_params(
-        exe,
-        os.path.join(param_dir, dirname),
-        main_program=program,
-        filename="params.pdparams")
-    print("save parameters at %s" % (os.path.join(param_dir, dirname)))
-
-    return True
diff --git a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/utils/save_load_io.py b/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/utils/save_load_io.py
deleted file mode 100644
index bdcdd811dd1ebb27542e5facc3ba050f00df08f8..0000000000000000000000000000000000000000
--- a/PaddleNLP/dialogue_system/dialogue_general_understanding/dgu/utils/save_load_io.py
+++ /dev/null
@@ -1,124 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""save or load model api"""
-
-import os
-import sys
-
-import paddle
-import paddle.fluid as fluid
-
-
-def init_from_pretrain_model(args, exe, program):
-
-    assert isinstance(args.init_from_pretrain_model, str)
-
-    if not os.path.exists(args.init_from_pretrain_model):
-        raise Warning("The pretrained params do not exist.")
-        return False
-
-    def existed_params(var):
-        if not isinstance(var, fluid.framework.Parameter):
-            return False
-        return os.path.exists(
-            os.path.join(args.init_from_pretrain_model, var.name))
-
-    fluid.io.load_vars(
-        exe,
-        args.init_from_pretrain_model,
-        main_program=program,
-        predicate=existed_params)
-
-    print("finish initing model from pretrained params from %s" %
-          (args.init_from_pretrain_model))
-
-    return True
-
-
-def init_from_checkpoint(args, exe, program):
-
-    assert isinstance(args.init_from_checkpoint, str)
-
-    if not os.path.exists(args.init_from_checkpoint):
-        raise Warning("the checkpoint path does not exist.")
-        return False
-
-    fluid.io.load_persistables(
-        executor=exe,
-        dirname=args.init_from_checkpoint,
-        main_program=program,
-        filename="checkpoint.pdckpt")
-
-    print("finish initing model from checkpoint from %s" %
-          (args.init_from_checkpoint))
-
-    return True
-
-
-def init_from_params(args, exe, program):
-
-    assert isinstance(args.init_from_params, str)
-
-    if not os.path.exists(args.init_from_params):
-        raise Warning("the params path does not exist.")
-        return False
-
-    fluid.io.load_params(
-        executor=exe,
-        dirname=args.init_from_params,
-        main_program=program,
-        filename="params.pdparams")
-
-    print("finish init model from params from %s" % (args.init_from_params))
-
-    return True
-
-
-def save_checkpoint(args, exe, program, dirname):
-
-    assert isinstance(args.save_model_path, str)
-
-    checkpoint_dir = os.path.join(args.save_model_path, args.save_checkpoint)
-
-    if not os.path.exists(checkpoint_dir):
-        os.mkdir(checkpoint_dir)
-
-    fluid.io.save_persistables(
-        exe,
-        os.path.join(checkpoint_dir, dirname),
-        main_program=program,
-        filename="checkpoint.pdckpt")
-
-    print("save checkpoint at %s" % (os.path.join(checkpoint_dir, dirname)))
-
-    return True
-
-
-def save_param(args, exe, program, dirname):
-
-    assert isinstance(args.save_model_path, str)
-
-    param_dir = os.path.join(args.save_model_path, args.save_param)
-
-    if not os.path.exists(param_dir):
-        os.makedirs(param_dir)
-
-    fluid.io.save_params(
-        exe,
-        os.path.join(param_dir, dirname),
-        main_program=program,
-        filename="params.pdparams")
-    print("save parameters at %s" % (os.path.join(param_dir, dirname)))
-
-    return True
diff --git a/PaddleNLP/lexical_analysis/creator.py b/PaddleNLP/lexical_analysis/creator.py
index bf02b57ae219512c97c3a428400ba99e750c7ce0..bc8e492f1b66e4474b626aa3dbfa767889f73e1d 100644
--- a/PaddleNLP/lexical_analysis/creator.py
+++ b/PaddleNLP/lexical_analysis/creator.py
@@ -105,15 +105,15 @@ def create_pyreader(args,
         # create lac pyreader
         if mode == 'train':
             pyreader.set_sample_list_generator(
-                paddle.batch(
-                    paddle.reader.shuffle(
+                fluid.io.batch(
+                    fluid.io.shuffle(
                         reader.file_reader(file_name),
                         buf_size=args.traindata_shuffle_buffer),
                     batch_size=args.batch_size / device_count),
                 places=place)
         else:
             pyreader.set_sample_list_generator(
-                paddle.batch(
+                fluid.io.batch(
                     reader.file_reader(
                         file_name, mode=mode),
                     batch_size=args.batch_size / device_count),
diff --git a/PaddleNLP/machine_translation/transformer/README.md b/PaddleNLP/machine_translation/transformer/README.md
index ee945ea05db9b9e52aac9a40dee0ab48fceb7318..9bc8462bf0f6f3203beda0d34d9c6021f63a3ef5 100644
--- a/PaddleNLP/machine_translation/transformer/README.md
+++ b/PaddleNLP/machine_translation/transformer/README.md
@@ -32,7 +32,7 @@
 
 1. paddle安装
 
-   本项目依赖于 PaddlePaddle 1.6及以上版本或适当的develop版本，请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装
+   本项目依赖于 PaddlePaddle 1.8及以上版本或适当的develop版本，请参考 [安装指南](https://www.paddlepaddle.org.cn/install/quick) 进行安装
 
 2. 下载代码
 
@@ -44,7 +44,7 @@
 
 3. 环境依赖
 
-   请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html)部分的内容
+   请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)部分的内容
 
 
 ### 数据准备
diff --git a/PaddleNLP/machine_translation/transformer/transformer.py b/PaddleNLP/machine_translation/transformer/transformer.py
index a73ce9f3006a30c0183cb4a71c96ce36c9677b07..66901bb1cf40d1fb09e7a51dc1b6d94ef00559c9 100644
--- a/PaddleNLP/machine_translation/transformer/transformer.py
+++ b/PaddleNLP/machine_translation/transformer/transformer.py
@@ -752,18 +752,17 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
         # caches contains states of history steps in decoder self-attention
         # and static encoder output projections in encoder-decoder attention
         # to reduce redundant computation.
+        batch_size = layers.shape(start_tokens)[0]
         caches = [
             {
                 "k":  # for self attention
-                layers.fill_constant_batch_size_like(
-                    input=start_tokens,
-                    shape=[-1, n_head, 0, d_key],
+                layers.fill_constant(
+                    shape=[batch_size, n_head, 0, d_key],
                     dtype=enc_output.dtype,
                     value=0),
                 "v":  # for self attention
-                layers.fill_constant_batch_size_like(
-                    input=start_tokens,
-                    shape=[-1, n_head, 0, d_value],
+                layers.fill_constant(
+                    shape=[batch_size, n_head, 0, d_value],
                     dtype=enc_output.dtype,
                     value=0),
                 "static_k":  # for encoder-decoder attention
@@ -792,12 +791,10 @@ def fast_decode(model_input, src_vocab_size, trg_vocab_size, max_in_len,
                 lambda x: layers.gather(x, index=gather_idx), caches)
             pre_src_attn_bias = layers.gather(
                 trg_src_attn_bias, index=gather_idx)
+            bias_batch_size = layers.shape(pre_src_attn_bias)[0]
             pre_pos = layers.elementwise_mul(
-                x=layers.fill_constant_batch_size_like(
-                    input=pre_src_attn_bias,  # cann't use lod tensor here
-                    value=1,
-                    shape=[-1, 1],
-                    dtype=pre_ids.dtype),
+                x=layers.fill_constant(
+                    value=1, shape=[bias_batch_size, 1], dtype=pre_ids.dtype),
                 y=step_idx,
                 axis=0)
             logits = wrap_decoder(
diff --git a/PaddleNLP/shared_modules/models/matching/paddle_layers.py b/PaddleNLP/shared_modules/models/matching/paddle_layers.py
index f8782a93860630f26f94d03afbd471e594b8ac9d..c3650f5c7054dba19fb9849decab4e06a91924e9 100644
--- a/PaddleNLP/shared_modules/models/matching/paddle_layers.py
+++ b/PaddleNLP/shared_modules/models/matching/paddle_layers.py
@@ -210,7 +210,7 @@ class DataLayer(object):
         """
         operation
         """
-        data = fluid.layers.data(
+        data = fluid.data(
             name=name, shape=shape, dtype=dtype, lod_level=lod_level)
         return data
 
@@ -383,8 +383,10 @@ class ConstantLayer(object):
         """
         operation
         """
-        constant = fluid.layers.fill_constant_batch_size_like(input, shape,
-                                                              dtype, value)
+        shape = list(shape)
+        input_shape = fluid.layers.shape(input)
+        shape[0] = input_shape[0]
+        constant = fluid.layers.fill_constant(shape, dtype, value)
         return constant
 
 
diff --git a/PaddleNLP/similarity_net/README.md b/PaddleNLP/similarity_net/README.md
index 61e8ce603eba123c147dbcad05de017ef1e18945..90208707cd051ab418a8b7f9f3e534e09b1a6b05 100644
--- a/PaddleNLP/similarity_net/README.md
+++ b/PaddleNLP/similarity_net/README.md
@@ -22,7 +22,7 @@
 |UNICOM|联通客服|客服|
 ## 快速开始
 #### 版本依赖
-本项目依赖于 Paddlepaddle Fluid 1.6，请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。
+本项目依赖于 Paddlepaddle Fluid 1.8，请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)进行安装。
 
 python版本依赖python 2.7
 #### 安装代码
diff --git a/PaddleNLP/similarity_net/run_classifier.py b/PaddleNLP/similarity_net/run_classifier.py
index 77271c463dbe9310164b1590436fd154cf726a62..4c25cf2c5a4bd2fa10ed909dd205964fd6585b62 100644
--- a/PaddleNLP/similarity_net/run_classifier.py
+++ b/PaddleNLP/similarity_net/run_classifier.py
@@ -47,46 +47,51 @@ from models.model_check import check_version
 from models.model_check import check_cuda
 
 
-def create_model(args, pyreader_name, is_inference=False, is_pointwise=False):
+def create_model(args, is_inference=False, is_pointwise=False):
     """
     Create Model for simnet
     """
     if is_inference:
-        inf_pyreader = fluid.layers.py_reader(
-        capacity=16,
-        shapes=([-1], [-1]),
-        dtypes=('int64', 'int64'),
-        lod_levels=(1, 1),
-        name=pyreader_name,
-        use_double_buffer=False)
+        left = fluid.data(name='left', shape=[None], dtype='int64', lod_level=1)
+        pos_right = fluid.data(
+            name='pos_right', shape=[None], dtype='int64', lod_level=1)
+        inf_loader = fluid.io.DataLoader.from_generator(
+            capacity=16,
+            feed_list=[left, pos_right],
+            iterable=False,
+            use_double_buffer=False)
 
-        left, pos_right = fluid.layers.read_file(inf_pyreader)
-        return inf_pyreader, left, pos_right
+        return inf_loader, left, pos_right
 
     else:
         if is_pointwise:
-            pointwise_pyreader = fluid.layers.py_reader(
-            capacity=16,
-            shapes=([-1], [-1], [-1]),
-            dtypes=('int64', 'int64', 'int64'),
-            lod_levels=(1, 1, 0),
-            name=pyreader_name,
-            use_double_buffer=False)
-
-            left, right, label = fluid.layers.read_file(pointwise_pyreader)
-            return pointwise_pyreader, left, right, label
+            left = fluid.data(
+                name='left', shape=[None], dtype='int64', lod_level=1)
+            right = fluid.data(
+                name='right', shape=[None], dtype='int64', lod_level=1)
+            label = fluid.data(name='label', shape=[None], dtype='int64')
+            pointwise_loader = fluid.io.DataLoader.from_generator(
+                capacity=16,
+                feed_list=[left, right, label],
+                iterable=False,
+                use_double_buffer=False)
+
+            return pointwise_loader, left, right, label
 
         else:
-            pairwise_pyreader = fluid.layers.py_reader(
-            capacity=16,
-            shapes=([-1], [-1], [-1]),
-            dtypes=('int64', 'int64', 'int64'),
-            lod_levels=(1, 1, 1),
-            name=pyreader_name,
-            use_double_buffer=False)
+            left = fluid.data(
+                name='left', shape=[None], dtype='int64', lod_level=1)
+            pos_right = fluid.data(
+                name='pos_right', shape=[None], dtype='int64', lod_level=1)
+            neg_right = fluid.data(
+                name='neg_right', shape=[None], dtype='int64', lod_level=1)
+            pairwise_loader = fluid.io.DataLoader.from_generator(
+                capacity=16,
+                feed_list=[left, pos_right, neg_right],
+                iterable=False,
+                use_double_buffer=False)
 
-            left, pos_right, neg_right = fluid.layers.read_file(pairwise_pyreader)
-            return pairwise_pyreader, left, pos_right, neg_right
+            return pairwise_loader, left, pos_right, neg_right
 
 
 def train(conf_dict, args):
@@ -131,8 +136,7 @@ def train(conf_dict, args):
         # Build network
         with fluid.program_guard(train_program, startup_prog):
             with fluid.unique_name.guard():
-                train_pyreader, left, pos_right, neg_right = create_model(
-                    args, pyreader_name='train_reader')
+                train_loader, left, pos_right, neg_right = create_model(args)
                 left_feat, pos_score = net.predict(left, pos_right)
                 pred = pos_score
                 _, neg_score = net.predict(left, neg_right)
@@ -147,8 +151,8 @@ def train(conf_dict, args):
             test_prog = fluid.Program()
             with fluid.program_guard(test_prog, startup_prog):
                 with fluid.unique_name.guard():
-                    test_pyreader, left, pos_right = create_model(
-                        args, pyreader_name='test_reader', is_inference=True)
+                    test_loader, left, pos_right = create_model(
+                        args, is_inference=True)
                     left_feat, pos_score = net.predict(left, pos_right)
                     pred = pos_score
             test_prog = test_prog.clone(for_test=True)
@@ -157,8 +161,8 @@ def train(conf_dict, args):
         # Build network
         with fluid.program_guard(train_program, startup_prog):
             with fluid.unique_name.guard():
-                train_pyreader, left, right, label = create_model(
-                    args, pyreader_name='train_reader', is_pointwise=True)
+                train_loader, left, right, label = create_model(
+                    args, is_pointwise=True)
                 left_feat, pred = net.predict(left, right)
                 avg_cost = loss.compute(pred, label)
                 avg_cost.persistable = True
@@ -171,15 +175,15 @@ def train(conf_dict, args):
             test_prog = fluid.Program()
             with fluid.program_guard(test_prog, startup_prog):
                 with fluid.unique_name.guard():
-                    test_pyreader, left, right = create_model(
-                        args, pyreader_name='test_reader', is_inference=True)
+                    test_loader, left, right = create_model(
+                        args, is_inference=True)
                     left_feat, pred = net.predict(left, right)
             test_prog = test_prog.clone(for_test=True)
 
     if args.init_checkpoint is not "":
         utils.init_checkpoint(exe, args.init_checkpoint, startup_prog)
 
-    def valid_and_test(test_program, test_pyreader, get_valid_examples, process,
+    def valid_and_test(test_program, test_loader, get_valid_examples, process,
                        mode, exe, fetch_list):
         """
         return auc and acc
@@ -187,15 +191,15 @@ def train(conf_dict, args):
         # Get Batch Data
         batch_data = fluid.io.batch(
             get_valid_examples, args.batch_size, drop_last=False)
-        test_pyreader.decorate_paddle_reader(batch_data)
-        test_pyreader.start()
+        test_loader.set_sample_list_generator(batch_data)
+        test_loader.start()
         pred_list = []
         while True:
             try:
                 _pred = exe.run(program=test_program, fetch_list=[pred.name])
                 pred_list += list(_pred)
             except fluid.core.EOFException:
-                test_pyreader.reset()
+                test_loader.reset()
                 break
         pred_list = np.vstack(pred_list)
         if mode == "test":
@@ -233,8 +237,8 @@ def train(conf_dict, args):
                 get_train_examples, buf_size=10000),
             args.batch_size,
             drop_last=False)
-    train_pyreader.decorate_paddle_reader(train_batch_data)
-    train_pyreader.start()
+    train_loader.set_sample_list_generator(train_batch_data)
+    train_loader.start()
     exe.run(startup_prog)
     losses = []
     start_time = time.time()
@@ -248,8 +252,8 @@ def train(conf_dict, args):
             if args.do_valid and global_step % args.validation_steps == 0:
                 get_valid_examples = simnet_process.get_reader("valid")
                 valid_result = valid_and_test(
-                    test_prog, test_pyreader, get_valid_examples,
-                    simnet_process, "valid", exe, [pred.name])
+                    test_prog, test_loader, get_valid_examples, simnet_process,
+                    "valid", exe, [pred.name])
                 if args.compute_accuracy:
                     valid_auc, valid_acc = valid_result
                     logging.info(
@@ -281,7 +285,7 @@ def train(conf_dict, args):
                 logging.info("saving infer model in %s" % model_path)
 
         except fluid.core.EOFException:
-            train_pyreader.reset()
+            train_loader.reset()
             break
     end_time = time.time()
     #logging.info("epoch: %d, loss: %f, used time: %d sec" %
@@ -327,9 +331,8 @@ def train(conf_dict, args):
         else:
             # Get Feeder and Reader
             get_test_examples = simnet_process.get_reader("test")
-        test_result = valid_and_test(test_prog, test_pyreader,
-                                     get_test_examples, simnet_process, "test",
-                                     exe, [pred.name])
+        test_result = valid_and_test(test_prog, test_loader, get_test_examples,
+                                     simnet_process, "test", exe, [pred.name])
         if args.compute_accuracy:
             test_auc, test_acc = test_result
             logging.info("AUC of test is %f, Accuracy of test is %f" %
@@ -371,8 +374,8 @@ def test(conf_dict, args):
         if args.task_mode == "pairwise":
             with fluid.program_guard(test_prog, startup_prog):
                 with fluid.unique_name.guard():
-                    test_pyreader, left, pos_right = create_model(
-                        args, pyreader_name='test_reader', is_inference=True)
+                    test_loader, left, pos_right = create_model(
+                        args, is_inference=True)
                     left_feat, pos_score = net.predict(left, pos_right)
                     pred = pos_score
             test_prog = test_prog.clone(for_test=True)
@@ -380,8 +383,8 @@ def test(conf_dict, args):
         else:
             with fluid.program_guard(test_prog, startup_prog):
                 with fluid.unique_name.guard():
-                    test_pyreader, left, right = create_model(
-                        args, pyreader_name='test_reader', is_inference=True)
+                    test_loader, left, right = create_model(
+                        args, is_inference=True)
                     left_feat, pred = net.predict(left, right)
             test_prog = test_prog.clone(for_test=True)
 
@@ -390,10 +393,10 @@ def test(conf_dict, args):
         utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
 
         test_exe = exe
-        test_pyreader.decorate_paddle_reader(batch_data)
+        test_loader.set_sample_list_generator(batch_data)
 
         logging.info("start test process ...")
-        test_pyreader.start()
+        test_loader.start()
         pred_list = []
         fetch_list = [pred.name]
         output = []
@@ -412,7 +415,7 @@ def test(conf_dict, args):
                         map(lambda item: str(np.argmax(item)), output[0])) +
                                            "\n")
             except fluid.core.EOFException:
-                test_pyreader.reset()
+                test_loader.reset()
                 break
         if args.task_mode == "pairwise":
             pred_list = np.array(pred_list).reshape((-1, 1))
@@ -468,16 +471,16 @@ def infer(conf_dict, args):
     if args.task_mode == "pairwise":
         with fluid.program_guard(test_prog, startup_prog):
             with fluid.unique_name.guard():
-                infer_pyreader, left, pos_right = create_model(
-                    args, pyreader_name='infer_reader', is_inference=True)
+                infer_loader, left, pos_right = create_model(
+                    args, is_inference=True)
                 left_feat, pos_score = net.predict(left, pos_right)
                 pred = pos_score
         test_prog = test_prog.clone(for_test=True)
     else:
         with fluid.program_guard(test_prog, startup_prog):
             with fluid.unique_name.guard():
-                infer_pyreader, left, right = create_model(
-                    args, pyreader_name='infer_reader', is_inference=True)
+                infer_loader, left, right = create_model(
+                    args, is_inference=True)
                 left_feat, pred = net.predict(left, right)
         test_prog = test_prog.clone(for_test=True)
 
@@ -486,13 +489,13 @@ def infer(conf_dict, args):
     utils.init_checkpoint(exe, args.init_checkpoint, main_program=test_prog)
 
     test_exe = exe
-    infer_pyreader.decorate_sample_list_generator(batch_data)
+    infer_loader.set_sample_list_generator(batch_data)
 
     logging.info("start test process ...")
     preds_list = []
     fetch_list = [pred.name]
     output = []
-    infer_pyreader.start()
+    infer_loader.start()
     while True:
         try:
             output = test_exe.run(program=test_prog, fetch_list=fetch_list)
@@ -502,7 +505,7 @@ def infer(conf_dict, args):
             else:
                 preds_list += map(lambda item: str(np.argmax(item)), output[0])
         except fluid.core.EOFException:
-            infer_pyreader.reset()
+            infer_loader.reset()
             break
     with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file:
         for _data, _pred in zip(simnet_process.get_infer_data(), preds_list):
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/README.md b/PaddleRec/ctr/Paddle_baseline_KDD2019/README.md
deleted file mode 100644
index 4555bb2a87c39e89fb922522e8368f4a3acccf3b..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/README.md
+++ /dev/null
@@ -1,40 +0,0 @@
-# Paddle_baseline_KDD2019
-Paddle baseline for KDD2019 "Context-Aware Multi-Modal Transportation Recommendation"(https://dianshi.baidu.com/competition/29/question)
-
-This repository is the demo codes for the  KDD2019 "Context-Aware Multi-Modal Transportation Recommendation" competition using PaddlePaddle. It is written by python and uses PaddlePaddle to solve the task. Note that this repository is on developing and welcome everyone to contribute. The current baseline solution codes can get 0.68 - 0.69 score of online submission. As an example, my submission based on these networks programmed by PaddlePaddle is 0.6898.
-The reason of the publication of this baseline codes is to encourage us to use PaddlePaddle and build the most powerful recommendation model via PaddlePaddle. 
-
-The example codes are ran on Linux, python2.7, single machine with CPU . Note that distributed train options are not provided here, if you want to learn more about this, please check more modes examples on https://github.com/PaddlePaddle/models. About the speed of training, for one epoch, 1000 batch size, it would take about 8 mins to train the whole training instances generated from raw data using SGD optimizer (it would take relatively longer using Adam optimizer). 
-
-The configuration and process of all the networks are fundamental, a lot of optimizations can be done based on them to achieve better results e.g. better cost function, more powerful feature engineering, designed model validation, NN optimization tricks...
-
-The code is rough and from my daily use. They will be trimmed these days...
-## Install PaddlePaddle
-please visit the official site of PaddlePaddle(http://www.paddlepaddle.org/documentation/docs/zh/1.4/beginners_guide/install/index_cn.html) 
-## preprocess feature
-```python
-python preprocess_dense.py # change for different feature strategy
-python pre_test_dense.py 
-```
-preprocess.py and preprocess_dense.py is the code for preprocessing the raw data. Two versions are provided to deal with all sparse features and sparse plus dense features. Correspondingly, pre_process_test.py and pre_test_dense.py are the codes to preproccess test raw data. The training instances are saved in json. It is very easy to add new features. In our demo, all features are generated from provided raw data except for weather feature, which is gengerated from open weather records.
-Note that the feature generated in this step need to fit in the input of the model input. Make sure we use the right version. In demo codes, The sparse plus dense features are used for network_confv6. 
-
-## build the network
-main network logic is in network_confv?.py. The networks are base on fm & deep related algorithms. I try several networks and public some of them. There may be some defects in the networks but all of them are functional. 
-
-## train the network
-```python
-python local_train.py
-```
-In local_train.py and map_reader.py, I use dataset API, so we need to download the corresponding .whl package or clone codes on develop branch of PaddlePaddle. The reason to use this is the speed of feeding data is much faster.
-Note that the input format feed into the network is self-defined. make sure we build the same format between training and test.  
-
-## test results
-```python
-python generate_test.py
-python build_submit.py
-```
-In generate_test.py and build_submit, for convenience, I use the whole train data to train the network and test the network with provided data without label
-
-
-
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/args.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/args.py
deleted file mode 100644
index 55745918ead086db1a0a9f80db673f9353b06ecd..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/args.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import argparse
-
-def parse_args():
-        parser = argparse.ArgumentParser(description="PaddlePaddle CTR example")
-        parser.add_argument(
-            '--train_data_path',
-            type=str,
-            default='./data/raw/train.txt',
-            help="The path of training dataset")
-        parser.add_argument(
-            '--test_data_path',
-            type=str,
-            default='./data/raw/valid.txt',
-            help="The path of testing dataset")
-        parser.add_argument(
-            '--batch_size',
-            type=int,
-            default=1000,
-            help="The size of mini-batch (default:1000)")
-        parser.add_argument(
-            '--embedding_size',
-            type=int,
-            default=16,
-            help="The size for embedding layer (default:10)")
-        parser.add_argument(
-            '--num_passes',
-            type=int,
-            default=10,
-            help="The number of passes to train (default: 10)")
-        parser.add_argument(
-            '--model_output_dir',
-            type=str,
-            default='models',
-            help='The path for model to store (default: models)')
-        parser.add_argument(
-            '--sparse_feature_dim',
-            type=int,
-            default=1000001,
-            help='sparse feature hashing space for index processing')
-        parser.add_argument(
-            '--is_local',
-            type=int,
-            default=1,
-            help='Local train or distributed train (default: 1)')
-        parser.add_argument(
-            '--cloud_train',
-            type=int,
-            default=0,
-            help='Local train or distributed train on paddlecloud (default: 0)')
-        parser.add_argument(
-            '--async_mode',
-            action='store_true',
-            default=False,
-            help='Whether start pserver in async mode to support ASGD')
-        parser.add_argument(
-            '--no_split_var',
-            action='store_true',
-            default=False,
-            help='Whether split variables into blocks when update_method is pserver')
-        parser.add_argument(
-            '--role',
-            type=str,
-            default='pserver', # trainer or pserver
-            help='The path for model to store (default: models)')
-        parser.add_argument(
-            '--endpoints',
-            type=str,
-            default='127.0.0.1:6000',
-            help='The pserver endpoints, like: 127.0.0.1:6000,127.0.0.1:6001')
-        parser.add_argument(
-            '--current_endpoint',
-            type=str,
-            default='127.0.0.1:6000',
-            help='The path for model to store (default: 127.0.0.1:6000)')
-        parser.add_argument(
-            '--trainer_id',
-            type=int,
-            default=0,
-            help='The path for model to store (default: models)')
-        parser.add_argument(
-            '--trainers',
-            type=int,
-            default=1,
-            help='The num of trianers, (default: 1)')
-        return parser.parse_args()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/build_submit.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/build_submit.py
deleted file mode 100644
index b4ffc498a2e5f1f7ccae6b99974258878a62fa04..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/build_submit.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import json
-import csv
-import io
-
-
-def build():
-    submit_map = {}
-    with io.open('./submit/submit.csv', 'wb') as csv_file:
-        writer = csv.writer(csv_file, delimiter=',')
-        writer.writerow(['sid', 'recommend_mode'])
-        with open('./out/normed_test_session.txt', 'r') as f1:
-            with open('./testres/res8', 'r') as f2:
-                cur_session =''
-                for x, y in zip(f1.readlines(), f2.readlines()):
-                    m1 = json.loads(x)
-                    session_id = m1["session_id"]
-                    if cur_session == '':
-                        cur_session = session_id
-
-                    transport_mode = m1["plan"]["transport_mode"]
-
-                    if cur_session != session_id:
-                        writer.writerow([str(cur_session), str(submit_map[cur_session]["transport_mode"])])
-                        cur_session = session_id
-                    if session_id not in submit_map:
-                        submit_map[session_id] = {}
-                        submit_map[session_id]["transport_mode"] = transport_mode
-                        submit_map[session_id]["probability"] = y
-                        #if int(submit_map[session_id]["transport_mode"]) == 0 and submit_map[session_id]["probability"] > 0.02:
-                            #submit_map[session_id]["probability"] = 0.99
-                    else:
-                        if float(y) > float(submit_map[session_id]["probability"]):
-                            submit_map[session_id]["transport_mode"] = transport_mode
-                            submit_map[session_id]["probability"] = y
-                            #if int(submit_map[session_id]["transport_mode"]) == 0 and submit_map[session_id]["probability"] > 0.02:
-                                #submit_map[session_id]["transport_mode"] = 0
-                                #submit_map[session_id]["probability"] = 0.99
-
-
-        writer.writerow([cur_session, submit_map[cur_session]["transport_mode"]])
-
-
-
-if __name__ == "__main__":
-    build()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._profiles.csv b/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._profiles.csv
deleted file mode 100644
index 3e7a69fbd3cccda1242c8056ce0f887f97775226..0000000000000000000000000000000000000000
Binary files a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._profiles.csv and /dev/null differ
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._test_plans.csv b/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._test_plans.csv
deleted file mode 100755
index 6070f2260eb3d5a7069741ec4ebb7e0ef57d13f3..0000000000000000000000000000000000000000
Binary files a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._test_plans.csv and /dev/null differ
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._test_queries.csv b/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._test_queries.csv
deleted file mode 100755
index 98060354f5bf046fff792ca359aecac750cce85e..0000000000000000000000000000000000000000
Binary files a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._test_queries.csv and /dev/null differ
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._train_clicks.csv b/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._train_clicks.csv
deleted file mode 100755
index 35f23f78ab5c254ccc1e631b8736a2854f5dfefa..0000000000000000000000000000000000000000
Binary files a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._train_clicks.csv and /dev/null differ
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._train_queries.csv b/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._train_queries.csv
deleted file mode 100755
index 3359b92b109fd1c12c60e1757c264c0b6f6a5313..0000000000000000000000000000000000000000
Binary files a/PaddleRec/ctr/Paddle_baseline_KDD2019/data_set_phase1/._train_queries.csv and /dev/null differ
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/generate_test.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/generate_test.py
deleted file mode 100644
index 66bf13d250e5487696177f6d710cd2ec73944d97..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/generate_test.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import numpy as np
-# disable gpu training for this example
-import os
-
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
-import paddle
-import paddle.fluid as fluid
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-num_context_feature = 22
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example")
-    parser.add_argument(
-        '--model_path',
-        type=str,
-        #required=True,
-        default='models',
-        help="The path of model parameters gz file")
-    parser.add_argument(
-        '--data_path',
-        type=str,
-        required=False,
-        help="The path of the dataset to infer")
-    parser.add_argument(
-        '--embedding_size',
-        type=int,
-        default=16,
-        help="The size for embedding layer (default:10)")
-    parser.add_argument(
-        '--sparse_feature_dim',
-        type=int,
-        default=1000001,
-        help="The size for embedding layer (default:1000001)")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=1000,
-        help="The size of mini-batch (default:1000)")
-
-    return parser.parse_args()
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-
-    return res
-
-
-def data2tensor(data, place):
-    feed_dict = {}
-    dense = data[0]
-    sparse = data[1:-1]
-    y = data[-1]
-    #user_data = np.array([x[0] for x in data]).astype("float32")
-    #user_data = user_data.reshape([-1, 10])
-    #feed_dict["user_profile"] = user_data
-    dense_data = np.array([x[0] for x in data]).astype("float32")
-    dense_data = dense_data.reshape([-1, 3])
-    feed_dict["dense_feature"] = dense_data
-    for i in range(num_context_feature):
-        sparse_data = to_lodtensor([x[1 + i] for x in data], place)
-        feed_dict["context" + str(i)] = sparse_data
-
-    context_fm = to_lodtensor(
-        np.array([x[-2] for x in data]).astype("float32"), place)
-
-    feed_dict["context_fm"] = context_fm
-    y_data = np.array([x[-1] for x in data]).astype("int64")
-    y_data = y_data.reshape([-1, 1])
-    feed_dict["label"] = y_data
-    return feed_dict
-
-
-def test():
-    args = parse_args()
-
-    place = fluid.CPUPlace()
-    test_scope = fluid.core.Scope()
-
-    # filelist = ["%s/%s" % (args.data_path, x) for x in os.listdir(args.data_path)]
-    from map_reader import MapDataset
-    map_dataset = MapDataset()
-    map_dataset.setup(args.sparse_feature_dim)
-    exe = fluid.Executor(place)
-
-    whole_filelist = ["./out/normed_test_session.txt"]
-    test_files = whole_filelist[int(0.0 * len(whole_filelist)):int(1.0 * len(
-        whole_filelist))]
-
-    epochs = 1
-
-    for i in range(epochs):
-        cur_model_path = os.path.join(args.model_path,
-                                      "epoch" + str(1) + ".model")
-        with open("./testres/res" + str(i), 'w') as r:
-            with fluid.scope_guard(test_scope):
-                [inference_program, feed_target_names, fetch_targets] = \
-                    fluid.io.load_inference_model(cur_model_path, exe)
-
-                test_reader = map_dataset.test_reader(test_files, 1000, 100000)
-                k = 0
-                for batch_id, data in enumerate(test_reader()):
-                    print(len(data[0]))
-                    feed_dict = data2tensor(data, place)
-                    loss_val, auc_val, accuracy, predict, _ = exe.run(
-                        inference_program,
-                        feed=feed_dict,
-                        fetch_list=fetch_targets,
-                        return_numpy=False)
-
-                    x = np.array(predict)
-                    for j in range(x.shape[0]):
-                        r.write(str(x[j][1]))
-                        r.write("\n")
-
-
-if __name__ == '__main__':
-    test()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/infer.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/infer.py
deleted file mode 100644
index c218ce0fccc94ee595fd1681b54258d8d6ce43c0..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/infer.py
+++ /dev/null
@@ -1,141 +0,0 @@
-import argparse
-import logging
-
-import numpy as np
-# disable gpu training for this example
-import os
-
-os.environ["CUDA_VISIBLE_DEVICES"] = ""
-import paddle
-import paddle.fluid as fluid
-
-import map_reader
-from network_conf import ctr_deepfm_dataset
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="PaddlePaddle DeepFM example")
-    parser.add_argument(
-        '--model_path',
-        type=str,
-        #required=True,
-        default='models',
-        help="The path of model parameters gz file")
-    parser.add_argument(
-        '--data_path',
-        type=str,
-        required=False,
-        help="The path of the dataset to infer")
-    parser.add_argument(
-        '--embedding_size',
-        type=int,
-        default=16,
-        help="The size for embedding layer (default:10)")
-    parser.add_argument(
-        '--sparse_feature_dim',
-        type=int,
-        default=1000001,
-        help="The size for embedding layer (default:1000001)")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=1000,
-        help="The size of mini-batch (default:1000)")
-
-    return parser.parse_args()
-
-
-def to_lodtensor(data, place):
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def data2tensor(data, place):
-    feed_dict = {}
-    test_dict = {}
-    dense = data[0]
-    sparse = data[1:-1]
-    y = data[-1]
-    dense_data = np.array([x[0] for x in data]).astype("float32")
-    dense_data = dense_data.reshape([-1, 65])
-    feed_dict["user_profile"] = dense_data
-    for i in range(10):
-        sparse_data = to_lodtensor([x[1 + i] for x in data], place)
-        feed_dict["context" + str(i)] = sparse_data
-
-    y_data = np.array([x[-1] for x in data]).astype("int64")
-    y_data = y_data.reshape([-1, 1])
-    feed_dict["label"] = y_data
-    test_dict["test"] = [1]
-    return feed_dict, test_dict
-
-
-def infer():
-    args = parse_args()
-
-    place = fluid.CPUPlace()
-    inference_scope = fluid.core.Scope()
-
-    filelist = [
-        "%s/%s" % (args.data_path, x) for x in os.listdir(args.data_path)
-    ]
-    from map_reader import MapDataset
-    map_dataset = MapDataset()
-    map_dataset.setup(args.sparse_feature_dim)
-    exe = fluid.Executor(place)
-
-    whole_filelist = [
-        "raw_data/part-%d" % x for x in range(len(os.listdir("raw_data")))
-    ]
-    #whole_filelist = ["./out/normed_train09",  "./out/normed_train10",  "./out/normed_train11"]
-    test_files = whole_filelist[int(0.0 * len(whole_filelist)):int(1.0 * len(
-        whole_filelist))]
-
-    # file_groups = [whole_filelist[i:i+train_thread_num] for i in range(0, len(whole_filelist), train_thread_num)]
-
-    def set_zero(var_name):
-        param = inference_scope.var(var_name).get_tensor()
-        param_array = np.zeros(param._get_dims()).astype("int64")
-        param.set(param_array, place)
-
-    epochs = 2
-    for i in range(epochs):
-        cur_model_path = os.path.join(args.model_path,
-                                      "epoch" + str(i + 1) + ".model")
-        with fluid.scope_guard(inference_scope):
-            [inference_program, feed_target_names, fetch_targets] = \
-                fluid.io.load_inference_model(cur_model_path, exe)
-            auc_states_names = ['_generated_var_2', '_generated_var_3']
-            for name in auc_states_names:
-                set_zero(name)
-
-            test_reader = map_dataset.infer_reader(test_files, 1000, 100000)
-            for batch_id, data in enumerate(test_reader()):
-                loss_val, auc_val, accuracy, predict, label = exe.run(
-                    inference_program,
-                    feed=data2tensor(data, place),
-                    fetch_list=fetch_targets,
-                    return_numpy=False)
-
-                #print(np.array(predict))
-                #x = np.array(predict)
-                #print(.shape)x
-            #print("train_pass_%d, test_pass_%d\t%f\t" % (i - 1, i, auc_val))
-
-
-if __name__ == '__main__':
-    infer()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/local_train.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/local_train.py
deleted file mode 100644
index 9d7e9452a14e08d293d77bc41fc2806ca9c0d1a2..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/local_train.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from __future__ import print_function
-
-from args import parse_args
-import os
-import paddle.fluid as fluid
-import sys
-from network_confv6 import ctr_deepfm_dataset
-
-NUM_CONTEXT_FEATURE = 22
-DIM_USER_PROFILE = 10
-DIM_DENSE_FEATURE = 3
-PYTHON_PATH = "/home/yaoxuefeng/whls/paddle_release_home/python/bin/python"  # this is mine change yours
-
-
-def train():
-    args = parse_args()
-    if not os.path.isdir(args.model_output_dir):
-        os.mkdir(args.model_output_dir)
-
-    #set the input format for our model. Note that you need to carefully modify them when you define a new network
-    #user_profile = fluid.layers.data(
-    #name="user_profile", shape=[DIM_USER_PROFILE], dtype='int64', lod_level=1)
-    dense_feature = fluid.layers.data(
-        name="dense_feature", shape=[DIM_DENSE_FEATURE], dtype='float32')
-    context_feature = [
-        fluid.layers.data(
-            name="context" + str(i), shape=[1], lod_level=1, dtype="int64")
-        for i in range(0, NUM_CONTEXT_FEATURE)
-    ]
-    context_feature_fm = fluid.layers.data(
-        name="context_fm", shape=[1], dtype='int64', lod_level=1)
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    print("ready to network")
-    #self define network 
-    loss, auc_var, batch_auc_var, accuracy, predict = ctr_deepfm_dataset(
-        dense_feature, context_feature, context_feature_fm, label,
-        args.embedding_size, args.sparse_feature_dim)
-
-    print("ready to optimize")
-    optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
-    optimizer.minimize(loss)
-    #single machine CPU training. more options on trainig please visit PaddlePaddle site
-    exe = fluid.Executor(fluid.CPUPlace())
-    exe.run(fluid.default_startup_program())
-    #use dataset api for much faster speed
-    dataset = fluid.DatasetFactory().create_dataset()
-    dataset.set_use_var([dense_feature] + context_feature +
-                        [context_feature_fm] + [label])
-    #self define how to process generated training insatnces in map_reader.py
-    pipe_command = PYTHON_PATH + "  map_reader.py %d" % args.sparse_feature_dim
-    dataset.set_pipe_command(pipe_command)
-    dataset.set_batch_size(args.batch_size)
-    thread_num = 1
-    dataset.set_thread(thread_num)
-    #self define how to split training files for example:"split -a 2 -d -l 200000 normed_train.txt normed_train"
-    whole_filelist = [
-        "./out/normed_train%d" % x for x in range(len(os.listdir("out")))
-    ]
-    whole_filelist = [
-        "./out/normed_train00", "./out/normed_train01", "./out/normed_train02",
-        "./out/normed_train03", "./out/normed_train04", "./out/normed_train05",
-        "./out/normed_train06", "./out/normed_train07", "./out/normed_train08",
-        "./out/normed_train09", "./out/normed_train10", "./out/normed_train11"
-    ]
-    print("ready to epochs")
-    epochs = 10
-    for i in range(epochs):
-        print("start %dth epoch" % i)
-        dataset.set_filelist(whole_filelist[:int(len(whole_filelist))])
-        #print the informations you want by setting fetch_list and fetch_info
-        exe.train_from_dataset(
-            program=fluid.default_main_program(),
-            dataset=dataset,
-            fetch_list=[auc_var, accuracy, predict, label],
-            fetch_info=["auc", "accuracy", "predict", "label"],
-            debug=False)
-        model_dir = os.path.join(args.model_output_dir,
-                                 '/epoch' + str(i + 1) + ".model")
-        sys.stderr.write("epoch%d finished" % (i + 1))
-        #save model
-        fluid.io.save_inference_model(
-            model_dir,
-            [dense_feature.name] + [x.name for x in context_feature] +
-            [context_feature_fm.name] + [label.name],
-            [loss, auc_var, accuracy, predict, label], exe)
-
-
-if __name__ == '__main__':
-    train()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/map_reader.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/map_reader.py
deleted file mode 100644
index 4a07e512d01c66e5bd778c0ab11a00f78fb138a0..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/map_reader.py
+++ /dev/null
@@ -1,145 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import sys
-import json
-import paddle.fluid.incubate.data_generator as dg
-
-
-class MapDataset(dg.MultiSlotDataGenerator):
-    def setup(self, sparse_feature_dim):
-        self.profile_length = 65
-        self.dense_length = 3
-        #feature names
-        self.dense_feature_list = ["distance", "price", "eta"]
-
-        self.pid_list = ["pid"]
-        self.query_feature_list = ["weekday", "hour", "o1", "o2", "d1", "d2"]
-        self.plan_feature_list = ["transport_mode"]
-        self.rank_feature_list = ["plan_rank", "whole_rank", "price_rank", "eta_rank", "distance_rank"]
-        self.rank_whole_pic_list = ["mode_rank1", "mode_rank2", "mode_rank3", "mode_rank4",
-                                    "mode_rank5"]
-        self.weather_feature_list = ["max_temp", "min_temp", "wea", "wind"]
-        self.hash_dim = 1000001
-        self.train_idx_ = 2000000
-        #carefully set if you change the features 
-        self.categorical_range_ = range(0, 22)
-
-    #process one instance
-    def _process_line(self, line):
-        instance = json.loads(line)
-        """
-        profile = instance["profile"]
-        len_profile = len(profile)
-        if len_profile >= 10:
-            user_profile_feature = profile[0:10]
-        else:
-            profile.extend([0]*(10-len_profile))
-            user_profile_feature = profile
-        
-        if len(profile) > 1 or (len(profile) == 1 and profile[0] != 0):
-            for p in profile:
-                if p >= 1 and p <= 65:
-                    user_profile_feature[p - 1] = 1
-        """
-        context_feature = []
-        context_feature_fm = []
-        dense_feature = [0] * self.dense_length
-        plan = instance["plan"]
-        for i, val in enumerate(self.dense_feature_list):
-            dense_feature[i] = plan[val]
-
-        if (instance["pid"] == ""):
-            instance["pid"] = 0
-
-        query = instance["query"]
-        weather_dic = instance["weather"]
-        for fea in self.pid_list:
-            context_feature.append([hash(fea + str(instance[fea])) % self.hash_dim])
-            context_feature_fm.append(hash(fea + str(instance[fea])) % self.hash_dim)
-        for fea in self.query_feature_list:
-            context_feature.append([hash(fea + str(query[fea])) % self.hash_dim])
-            context_feature_fm.append(hash(fea + str(query[fea])) % self.hash_dim)
-        for fea in self.plan_feature_list:
-            context_feature.append([hash(fea + str(plan[fea])) % self.hash_dim])
-            context_feature_fm.append(hash(fea + str(plan[fea])) % self.hash_dim)
-        for fea in self.rank_feature_list:
-            context_feature.append([hash(fea + str(instance[fea])) % self.hash_dim])
-            context_feature_fm.append(hash(fea + str(instance[fea])) % self.hash_dim)
-        for fea in self.rank_whole_pic_list:
-            context_feature.append([hash(fea + str(instance[fea])) % self.hash_dim])
-            context_feature_fm.append(hash(fea + str(instance[fea])) % self.hash_dim)
-        for fea in self.weather_feature_list:
-            context_feature.append([hash(fea + str(weather_dic[fea])) % self.hash_dim])
-            context_feature_fm.append(hash(fea + str(weather_dic[fea])) % self.hash_dim)
-
-        label = [int(instance["label"])]
-
-        return dense_feature, context_feature, context_feature_fm, label
-
-    def infer_reader(self, filelist, batch, buf_size):
-        print(filelist)
-
-        def local_iter():
-            for fname in filelist:
-                with open(fname.strip(), "r") as fin:
-                    for line in fin:
-                        dense_feature, sparse_feature, sparse_feature_fm, label = self._process_line(line)
-                        yield [dense_feature] + sparse_feature + [sparse_feature_fm] + [label]
-
-        import paddle
-        batch_iter = paddle.batch(
-            paddle.reader.shuffle(
-                local_iter, buf_size=buf_size),
-            batch_size=batch)
-        return batch_iter
-
-    #generat inputs for testing
-    def test_reader(self, filelist, batch, buf_size):
-        print(filelist)
-
-        def local_iter():
-            for fname in filelist:
-                with open(fname.strip(), "r") as fin:
-                    for line in fin:
-                        dense_feature, sparse_feature, sparse_feature_fm, label = self._process_line(line)
-                        yield [dense_feature] + sparse_feature + [sparse_feature_fm] + [label]
-
-        import paddle
-        batch_iter = paddle.batch(
-            paddle.reader.buffered(
-                local_iter, size=buf_size),
-            batch_size=batch)
-        return batch_iter
-
-    #generate inputs for trainig 
-    def generate_sample(self, line):
-        def data_iter():
-            dense_feature, sparse_feature, sparse_feature_fm, label = self._process_line(line)
-            #feature_name = ["user_profile"]
-            feature_name = []
-            feature_name.append("dense_feature")
-            for idx in self.categorical_range_:
-                feature_name.append("context" + str(idx))
-            feature_name.append("context_fm")
-            feature_name.append("label")
-            yield zip(feature_name, [dense_feature] + sparse_feature + [sparse_feature_fm] + [label])
-
-        return data_iter
-
-
-if __name__ == "__main__":
-    map_dataset = MapDataset()
-    map_dataset.setup(int(sys.argv[1]))
-    map_dataset.run_from_stdin()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/network_confv6.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/network_confv6.py
deleted file mode 100644
index 5a9a5d4b18dd99133b50b143da451495735311dd..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/network_confv6.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import math
-
-user_profile_dim = 65
-dense_feature_dim = 3
-
-def ctr_deepfm_dataset(dense_feature, context_feature, context_feature_fm, label,
-                       embedding_size, sparse_feature_dim):
-    def dense_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
-
-        first_order = fluid.layers.fc(input=input, size=1)
-        emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
-                                                  dtype='float32', attr=fm_param_attr)
-
-        input_mul_factor = fluid.layers.matmul(input, emb_table)
-        input_mul_factor_square = fluid.layers.square(input_mul_factor)
-        input_square = fluid.layers.square(input)
-        factor_square = fluid.layers.square(emb_table)
-        input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
-
-        second_order = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
-        return first_order, second_order
-
-
-    dense_fm_param_attr = fluid.param_attr.ParamAttr(name="DenseFeatFactors",
-                                                     initializer=fluid.initializer.Normal(
-                                                         scale=1 / math.sqrt(dense_feature_dim)))
-    dense_fm_first, dense_fm_second = dense_fm_layer(
-        dense_feature, dense_feature_dim, 16, dense_fm_param_attr)
-
-
-    def sparse_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
-
-        first_embeddings = fluid.layers.embedding(
-            input=input, dtype='float32', size=[emb_dict_size, 1], is_sparse=True)
-        first_order = fluid.layers.sequence_pool(input=first_embeddings, pool_type='sum')
-
-        nonzero_embeddings = fluid.layers.embedding(
-            input=input, dtype='float32', size=[emb_dict_size, factor_size],
-            param_attr=fm_param_attr, is_sparse=True)
-        summed_features_emb = fluid.layers.sequence_pool(input=nonzero_embeddings, pool_type='sum')
-        summed_features_emb_square = fluid.layers.square(summed_features_emb)
-
-        squared_features_emb = fluid.layers.square(nonzero_embeddings)
-        squared_sum_features_emb = fluid.layers.sequence_pool(
-            input=squared_features_emb, pool_type='sum')
-
-        second_order = 0.5 * (summed_features_emb_square - squared_sum_features_emb)
-        return first_order, second_order
-
-    sparse_fm_param_attr = fluid.param_attr.ParamAttr(name="SparseFeatFactors",
-                                                      initializer=fluid.initializer.Normal(
-                                                          scale=1 / math.sqrt(sparse_feature_dim)))
-
-    #data = fluid.layers.data(name='ids', shape=[1], dtype='float32')
-    sparse_fm_first, sparse_fm_second = sparse_fm_layer(
-        context_feature_fm, sparse_feature_dim, 16, sparse_fm_param_attr)
-
-    def embedding_layer(input):
-        return fluid.layers.embedding(
-            input=input,
-            is_sparse=True,
-            # you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
-            # if you want to set is_distributed to True
-            is_distributed=False,
-            size=[sparse_feature_dim, embedding_size],
-            param_attr=fluid.ParamAttr(name="SparseFeatFactors",
-                                       initializer=fluid.initializer.Uniform()))
-
-    sparse_embed_seq = list(map(embedding_layer, context_feature))
-
-    concated_ori = fluid.layers.concat(sparse_embed_seq + [dense_feature], axis=1)
-    concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
-
-    deep = deep_net(concated)
-
-    predict = fluid.layers.fc(input=[deep, sparse_fm_first, sparse_fm_second, dense_fm_first, dense_fm_second], size=2, act="softmax",
-                              param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-                                  scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
-
-    #similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
-
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-
-    avg_cost = fluid.layers.reduce_sum(cost)
-    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    auc_var, batch_auc_var, auc_states = \
-        fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
-    return avg_cost, auc_var, batch_auc_var, accuracy, predict
-
-
-def deep_net(concated, lr_x=0.0001):
-    fc_layers_input = [concated]
-    fc_layers_size = [400, 400, 400]
-    fc_layers_act = ["relu"] * (len(fc_layers_size))
-
-    for i in range(len(fc_layers_size)):
-        fc = fluid.layers.fc(
-            input=fc_layers_input[-1],
-            size=fc_layers_size[i],
-            act=fc_layers_act[i],
-            param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
-
-        fc_layers_input.append(fc)
-    #w_res = fluid.layers.create_parameter(shape=[353, 16], dtype='float32', name="w_res")
-    #high_path = fluid.layers.matmul(concated, w_res)
-
-    #return fluid.layers.elementwise_add(high_path, fc_layers_input[-1])
-    return fc_layers_input[-1]
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_conf.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_conf.py
deleted file mode 100644
index ef6d0103fe26b10cd74047cb8de2c8442b13cbff..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_conf.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import paddle.fluid as fluid
-import math
-
-user_profile_dim = 65
-num_context = 25
-dim_fm_vector = 16
-dim_concated = user_profile_dim + dim_fm_vector * (num_context)
-
-
-def ctr_deepfm_dataset(user_profile, context_feature, label,
-                       embedding_size, sparse_feature_dim):
-    def embedding_layer(input):
-        return fluid.layers.embedding(
-            input=input,
-            is_sparse=True,
-            # you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
-            # if you want to set is_distributed to True
-            is_distributed=False,
-            size=[sparse_feature_dim, embedding_size],
-            param_attr=fluid.ParamAttr(name="SparseFeatFactors",
-                                       initializer=fluid.initializer.Uniform()))
-
-    sparse_embed_seq = list(map(embedding_layer, context_feature))
-
-    w = fluid.layers.create_parameter(
-        shape=[65, 65], dtype='float32',
-        name="w_fm")
-    user_profile_emb = fluid.layers.matmul(user_profile, w)
-
-    concated_ori = fluid.layers.concat(sparse_embed_seq + [user_profile_emb], axis=1)
-    concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
-
-    deep = deep_net(concated)
-    linear_term, second_term = fm(concated, dim_concated, 8) #depend on the number of context feature
-
-    predict = fluid.layers.fc(input=[deep, linear_term, second_term], size=2, act="softmax",
-                              param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-                                  scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
-
-    #similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
-
-
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-
-    avg_cost = fluid.layers.reduce_sum(cost)
-    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    auc_var, batch_auc_var, auc_states = \
-        fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
-    return avg_cost, auc_var, batch_auc_var, accuracy, predict
-
-
-def deep_net(concated, lr_x=0.0001):
-    fc_layers_input = [concated]
-    fc_layers_size = [128, 64, 32, 16]
-    fc_layers_act = ["relu"] * (len(fc_layers_size))
-
-    for i in range(len(fc_layers_size)):
-        fc = fluid.layers.fc(
-            input=fc_layers_input[-1],
-            size=fc_layers_size[i],
-            act=fc_layers_act[i],
-            param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
-
-        fc_layers_input.append(fc)
-
-    return fc_layers_input[-1]
-
-
-def fm(concated, emb_dict_size, factor_size, lr_x=0.0001):
-    linear_term = fluid.layers.fc(input=concated, size=8, act=None, param_attr=fluid.ParamAttr(learning_rate=lr_x))
-
-    emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
-                                                  dtype='float32')
-
-    input_mul_factor = fluid.layers.matmul(concated, emb_table)
-    input_mul_factor_square = fluid.layers.square(input_mul_factor)
-    input_square = fluid.layers.square(concated)
-    factor_square = fluid.layers.square(emb_table)
-    input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
-
-    second_term = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
-
-    return linear_term, second_term
-
-
-
-
-
-
-
-
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_confv4.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_confv4.py
deleted file mode 100644
index 9700016813b6cef7ed5c9b37c63ebecee6f66f32..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_confv4.py
+++ /dev/null
@@ -1,154 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import math
-
-user_profile_dim = 65
-slot_1 = [0, 1, 2, 3, 4, 5]
-slot_2 = [6]
-slot_3 = [7, 8, 9, 10, 11]
-slot_4 = [12, 13, 14, 15, 16]
-slot_5 = [17, 18, 19, 20]
-num_context = 25
-num_slots_pair = 5
-dim_fm_vector = 16
-dim_concated = user_profile_dim + dim_fm_vector * (num_context + num_slots_pair)
-
-def ctr_deepfm_dataset(user_profile, dense_feature, context_feature, label,
-                       embedding_size, sparse_feature_dim):
-    def embedding_layer(input):
-        return fluid.layers.embedding(
-            input=input,
-            is_sparse=True,
-            # you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
-            # if you want to set is_distributed to True
-            is_distributed=False,
-            size=[sparse_feature_dim, embedding_size],
-            param_attr=fluid.ParamAttr(name="SparseFeatFactors",
-                                       initializer=fluid.initializer.Uniform()))
-
-    sparse_embed_seq = list(map(embedding_layer, context_feature))
-
-    w = fluid.layers.create_parameter(
-        shape=[65, 65], dtype='float32',
-        name="w_fm")
-
-    user_emb_list = []
-    user_profile_emb = fluid.layers.matmul(user_profile, w)
-    user_emb_list.append(user_profile_emb)
-    user_emb_list.append(dense_feature)
-
-    w1 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_1")
-    w2 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_2")
-    w3 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_3")
-    w4 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_4")
-    w5 = fluid.layers.create_parameter(shape=[65, dim_fm_vector], dtype='float32', name="w_5")
-    user_profile_emb_1 = fluid.layers.matmul(user_profile, w1)
-    user_profile_emb_2 = fluid.layers.matmul(user_profile, w2)
-    user_profile_emb_3 = fluid.layers.matmul(user_profile, w3)
-    user_profile_emb_4 = fluid.layers.matmul(user_profile, w4)
-    user_profile_emb_5 = fluid.layers.matmul(user_profile, w5)
-
-    sparse_embed_seq_1 = embedding_layer(context_feature[slot_1[0]])
-    sparse_embed_seq_2 = embedding_layer(context_feature[slot_2[0]])
-    sparse_embed_seq_3 = embedding_layer(context_feature[slot_3[0]])
-    sparse_embed_seq_4 = embedding_layer(context_feature[slot_4[0]])
-    sparse_embed_seq_5 = embedding_layer(context_feature[slot_5[0]])
-    for i in slot_1[1:-1]:
-        sparse_embed_seq_1 = fluid.layers.elementwise_add(sparse_embed_seq_1, embedding_layer(context_feature[i]))
-    for i in slot_2[1:-1]:
-        sparse_embed_seq_2 = fluid.layers.elementwise_add(sparse_embed_seq_2, embedding_layer(context_feature[i]))
-    for i in slot_3[1:-1]:
-        sparse_embed_seq_3 = fluid.layers.elementwise_add(sparse_embed_seq_3, embedding_layer(context_feature[i]))
-    for i in slot_4[1:-1]:
-        sparse_embed_seq_4 = fluid.layers.elementwise_add(sparse_embed_seq_4, embedding_layer(context_feature[i]))
-    for i in slot_5[1:-1]:
-        sparse_embed_seq_5 = fluid.layers.elementwise_add(sparse_embed_seq_5, embedding_layer(context_feature[i]))
-
-    ele_product_1 = fluid.layers.elementwise_mul(user_profile_emb_1, sparse_embed_seq_1)
-    user_emb_list.append(ele_product_1)
-    ele_product_2 = fluid.layers.elementwise_mul(user_profile_emb_2, sparse_embed_seq_2)
-    user_emb_list.append(ele_product_2)
-    ele_product_3 = fluid.layers.elementwise_mul(user_profile_emb_3, sparse_embed_seq_3)
-    user_emb_list.append(ele_product_3)
-    ele_product_4 = fluid.layers.elementwise_mul(user_profile_emb_4, sparse_embed_seq_4)
-    user_emb_list.append(ele_product_4)
-    ele_product_5 = fluid.layers.elementwise_mul(user_profile_emb_5, sparse_embed_seq_5)
-    user_emb_list.append(ele_product_5)
-
-    ffm_1 = fluid.layers.reduce_sum(ele_product_1, dim=1, keep_dim=True)
-    ffm_2 = fluid.layers.reduce_sum(ele_product_2, dim=1, keep_dim=True)
-    ffm_3 = fluid.layers.reduce_sum(ele_product_3, dim=1, keep_dim=True)
-    ffm_4 = fluid.layers.reduce_sum(ele_product_4, dim=1, keep_dim=True)
-    ffm_5 = fluid.layers.reduce_sum(ele_product_5, dim=1, keep_dim=True)
-
-
-    concated_ori = fluid.layers.concat(sparse_embed_seq + user_emb_list, axis=1)
-    concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
-
-    deep = deep_net(concated)
-    linear_term, second_term = fm(concated, dim_concated, 8) #depend on the number of context feature
-
-    predict = fluid.layers.fc(input=[deep, linear_term, second_term, ffm_1, ffm_2, ffm_3, ffm_4, ffm_5], size=2, act="softmax",
-                              param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-                                  scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
-
-    #similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
-
-
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-
-    avg_cost = fluid.layers.reduce_sum(cost)
-    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    auc_var, batch_auc_var, auc_states = \
-        fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
-    return avg_cost, auc_var, batch_auc_var, accuracy, predict
-
-
-def deep_net(concated, lr_x=0.0001):
-    fc_layers_input = [concated]
-    fc_layers_size = [256, 128, 64, 32, 16]
-    fc_layers_act = ["relu"] * (len(fc_layers_size))
-
-    for i in range(len(fc_layers_size)):
-        fc = fluid.layers.fc(
-            input=fc_layers_input[-1],
-            size=fc_layers_size[i],
-            act=fc_layers_act[i],
-            param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
-
-        fc_layers_input.append(fc)
-    w_res = fluid.layers.create_parameter(shape=[dim_concated, 16], dtype='float32', name="w_res")
-    high_path = fluid.layers.matmul(concated, w_res)
-
-    return fluid.layers.elementwise_add(high_path, fc_layers_input[-1])
-    #return fc_layers_input[-1]
-
-
-def fm(concated, emb_dict_size, factor_size, lr_x=0.0001):
-    linear_term = fluid.layers.fc(input=concated, size=8, act=None, param_attr=fluid.ParamAttr(learning_rate=lr_x))
-
-    emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
-                                                  dtype='float32')
-
-    input_mul_factor = fluid.layers.matmul(concated, emb_table)
-    input_mul_factor_square = fluid.layers.square(input_mul_factor)
-    input_square = fluid.layers.square(concated)
-    factor_square = fluid.layers.square(emb_table)
-    input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
-
-    second_term = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
-
-    return linear_term, second_term
\ No newline at end of file
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_confv6.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_confv6.py
deleted file mode 100644
index ed638e97a981e1af24388f59e7b446d0e6507749..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/networks/network_confv6.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-import math
-
-user_profile_dim = 65
-dense_feature_dim = 3
-
-def ctr_deepfm_dataset(dense_feature, context_feature, context_feature_fm, label,
-                       embedding_size, sparse_feature_dim):
-    def dense_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
-
-        first_order = fluid.layers.fc(input=input, size=1)
-        emb_table = fluid.layers.create_parameter(shape=[emb_dict_size, factor_size],
-                                                  dtype='float32', attr=fm_param_attr)
-
-        input_mul_factor = fluid.layers.matmul(input, emb_table)
-        input_mul_factor_square = fluid.layers.square(input_mul_factor)
-        input_square = fluid.layers.square(input)
-        factor_square = fluid.layers.square(emb_table)
-        input_square_mul_factor_square = fluid.layers.matmul(input_square, factor_square)
-
-        second_order = 0.5 * (input_mul_factor_square - input_square_mul_factor_square)
-        return first_order, second_order
-
-
-    dense_fm_param_attr = fluid.param_attr.ParamAttr(name="DenseFeatFactors",
-                                                     initializer=fluid.initializer.Normal(
-                                                         scale=1 / math.sqrt(dense_feature_dim)))
-    dense_fm_first, dense_fm_second = dense_fm_layer(
-        dense_feature, dense_feature_dim, 16, dense_fm_param_attr)
-
-
-    def sparse_fm_layer(input, emb_dict_size, factor_size, fm_param_attr):
-
-        first_embeddings = fluid.layers.embedding(
-            input=input, dtype='float32', size=[emb_dict_size, 1], is_sparse=True)
-        first_order = fluid.layers.sequence_pool(input=first_embeddings, pool_type='sum')
-
-        nonzero_embeddings = fluid.layers.embedding(
-            input=input, dtype='float32', size=[emb_dict_size, factor_size],
-            param_attr=fm_param_attr, is_sparse=True)
-        summed_features_emb = fluid.layers.sequence_pool(input=nonzero_embeddings, pool_type='sum')
-        summed_features_emb_square = fluid.layers.square(summed_features_emb)
-
-        squared_features_emb = fluid.layers.square(nonzero_embeddings)
-        squared_sum_features_emb = fluid.layers.sequence_pool(
-            input=squared_features_emb, pool_type='sum')
-
-        second_order = 0.5 * (summed_features_emb_square - squared_sum_features_emb)
-        return first_order, second_order
-
-    sparse_fm_param_attr = fluid.param_attr.ParamAttr(name="SparseFeatFactors",
-                                                      initializer=fluid.initializer.Normal(
-                                                          scale=1 / math.sqrt(sparse_feature_dim)))
-
-    #data = fluid.layers.data(name='ids', shape=[1], dtype='float32')
-    sparse_fm_first, sparse_fm_second = sparse_fm_layer(
-        context_feature_fm, sparse_feature_dim, 16, sparse_fm_param_attr)
-
-    def embedding_layer(input):
-        return fluid.layers.embedding(
-            input=input,
-            is_sparse=True,
-            # you need to patch https://github.com/PaddlePaddle/Paddle/pull/14190
-            # if you want to set is_distributed to True
-            is_distributed=False,
-            size=[sparse_feature_dim, embedding_size],
-            param_attr=fluid.ParamAttr(name="SparseFeatFactors",
-                                       initializer=fluid.initializer.Uniform()))
-
-    sparse_embed_seq = list(map(embedding_layer, context_feature))
-
-    concated_ori = fluid.layers.concat(sparse_embed_seq + [dense_feature], axis=1)
-    concated = fluid.layers.batch_norm(input=concated_ori, name="bn", epsilon=1e-4)
-
-    deep = deep_net(concated)
-
-    predict = fluid.layers.fc(input=[deep, sparse_fm_first, sparse_fm_second, dense_fm_first, dense_fm_second], size=2, act="softmax",
-                              param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
-                                  scale=1 / math.sqrt(deep.shape[1])), learning_rate=0.01))
-
-    #similarity_norm = fluid.layers.sigmoid(fluid.layers.clip(predict, min=-15.0, max=15.0), name="similarity_norm")
-
-    cost = fluid.layers.cross_entropy(input=predict, label=label)
-
-    avg_cost = fluid.layers.reduce_sum(cost)
-    accuracy = fluid.layers.accuracy(input=predict, label=label)
-    auc_var, batch_auc_var, auc_states = \
-        fluid.layers.auc(input=predict, label=label, num_thresholds=2 ** 12, slide_steps=20)
-    return avg_cost, auc_var, batch_auc_var, accuracy, predict
-
-
-def deep_net(concated, lr_x=0.0001):
-    fc_layers_input = [concated]
-    fc_layers_size = [400, 400, 400]
-    fc_layers_act = ["relu"] * (len(fc_layers_size))
-
-    for i in range(len(fc_layers_size)):
-        fc = fluid.layers.fc(
-            input=fc_layers_input[-1],
-            size=fc_layers_size[i],
-            act=fc_layers_act[i],
-            param_attr=fluid.ParamAttr(learning_rate=lr_x * 0.5))
-
-        fc_layers_input.append(fc)
-    #w_res = fluid.layers.create_parameter(shape=[353, 16], dtype='float32', name="w_res")
-    #high_path = fluid.layers.matmul(concated, w_res)
-
-    #return fluid.layers.elementwise_add(high_path, fc_layers_input[-1])
-    return fc_layers_input[-1]
\ No newline at end of file
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/pre_process_test.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/pre_process_test.py
deleted file mode 100644
index 44462a9910f781c20659b3bfd51d5e794e5bed58..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/pre_process_test.py
+++ /dev/null
@@ -1,307 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys, time, random, csv, datetime, json
-import pandas as pd
-import numpy as np
-import argparse
-import logging
-import time
-
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("preprocess")
-logger.setLevel(logging.INFO)
-
-TEST_QUERIES_PATH = "./data_set_phase1/test_queries.csv"
-TEST_PLANS_PATH = "./data_set_phase1/test_plans.csv"
-TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
-PROFILES_PATH = "./data_set_phase1/profiles.csv"
-OUT_NORM_TEST_PATH = "./out/normed_test_session.txt"
-OUT_RAW_TEST_PATH = "./out/test_session.txt"
-
-O1_MIN = 115.47
-O1_MAX = 117.29
-
-O2_MIN = 39.46
-O2_MAX = 40.97
-
-D1_MIN = 115.44
-D1_MAX = 117.37
-
-D2_MIN = 39.46
-D2_MAX = 40.96
-SCALE_OD = 0.02
-
-DISTANCE_MIN = 1.0
-DISTANCE_MAX = 225864.0
-THRESHOLD_DIS = 40000.0
-SCALE_DIS = 500
-
-PRICE_MIN = 200.0
-PRICE_MAX = 92300.0
-THRESHOLD_PRICE = 20000
-SCALE_PRICE = 100
-
-ETA_MIN = 1.0
-ETA_MAX = 72992.0
-THRESHOLD_ETA = 10800.0
-SCALE_ETA = 120
-
-
-def build_norm_feature():
-    with open(OUT_NORM_TEST_PATH, 'w') as nf:
-        with open(OUT_RAW_TEST_PATH, 'r') as f:
-            for line in f:
-                cur_map = json.loads(line)
-
-                if cur_map["plan"]["distance"] > THRESHOLD_DIS:
-                    cur_map["plan"]["distance"] = int(THRESHOLD_DIS)
-                elif cur_map["plan"]["distance"] > 0:
-                    cur_map["plan"]["distance"] = int(cur_map["plan"]["distance"] / SCALE_DIS)
-
-                if cur_map["plan"]["price"] and cur_map["plan"]["price"] > THRESHOLD_PRICE:
-                    cur_map["plan"]["price"] = int(THRESHOLD_PRICE)
-                elif not cur_map["plan"]["price"] or cur_map["plan"]["price"] < 0:
-                    cur_map["plan"]["price"] = 0
-                else:
-                    cur_map["plan"]["price"] = int(cur_map["plan"]["price"] / SCALE_PRICE)
-
-                if cur_map["plan"]["eta"] > THRESHOLD_ETA:
-                    cur_map["plan"]["eta"] = int(THRESHOLD_ETA)
-                elif cur_map["plan"]["eta"] > 0:
-                    cur_map["plan"]["eta"] = int(cur_map["plan"]["eta"] / SCALE_ETA)
-
-                # o1
-                if cur_map["query"]["o1"] > O1_MAX:
-                    cur_map["query"]["o1"] = int((O1_MAX - O1_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["o1"] < O1_MIN:
-                    cur_map["query"]["o1"] = 0
-                else:
-                    cur_map["query"]["o1"] = int((cur_map["query"]["o1"] - O1_MIN) / 0.02)
-
-                # o2
-                if cur_map["query"]["o2"] > O2_MAX:
-                    cur_map["query"]["o2"] = int((O2_MAX - O2_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["o2"] < O2_MIN:
-                    cur_map["query"]["o2"] = 0
-                else:
-                    cur_map["query"]["o2"] = int((cur_map["query"]["o2"] - O2_MIN) / 0.02)
-
-                # d1
-                if cur_map["query"]["d1"] > D1_MAX:
-                    cur_map["query"]["d1"] = int((D1_MAX - D1_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["d1"] < D1_MIN:
-                    cur_map["query"]["d1"] = 0
-                else:
-                    cur_map["query"]["d1"] = int((cur_map["query"]["d1"] - D1_MIN) / SCALE_OD)
-
-                # d2
-                if cur_map["query"]["d2"] > D2_MAX:
-                    cur_map["query"]["d2"] = int((D2_MAX - D2_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["d2"] < D2_MIN:
-                    cur_map["query"]["d2"] = 0
-                else:
-                    cur_map["query"]["d2"] = int((cur_map["query"]["d2"] - D2_MIN) / SCALE_OD)
-
-                cur_json_instance = json.dumps(cur_map)
-                nf.write(cur_json_instance + '\n')
-
-
-def preprocess():
-    """
-    Construct the train data indexed by session id and mode id jointly. Convert some of the raw features (user profile,
-    od pair, req time, click time, eta, price, distance, transport mode) to one-hot ids used for
-    embedding. We split the one-hot features into two categories: user feature and context feature for
-    better understanding of FM algorithm.
-    Note that the user profile is already provided by one-hot encoded form, we convert it back to the
-    ids for unity with the context feature and easily using of PaddlePaddle embedding layer. Given the
-    train clicks data, we label each train instance with 1 or 0 depend on if this instance is clicked or
-    not.
-    :return:
-    """
-
-    train_data_dict = {}
-    with open("./weather.json", 'r') as f:
-        weather_dict = json.load(f)
-
-    with open(TEST_QUERIES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        train_index_list = []
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "": continue
-            if line[1] == "":
-                train_index_list.append(line[0] + "_0")
-            else:
-                train_index_list.append(line[0] + "_" + line[1])
-
-            train_index = line[0]
-            train_data_dict[train_index] = {}
-            train_data_dict[train_index]["pid"] = line[1]
-            train_data_dict[train_index]["query"] = {}
-
-            reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
-            reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
-
-            date_key = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%m-%d")
-            train_data_dict[train_index]["weather"] = {}
-            train_data_dict[train_index]["weather"].update({"max_temp": weather_dict[date_key]["max_temp"]})
-            train_data_dict[train_index]["weather"].update({"min_temp": weather_dict[date_key]["min_temp"]})
-            train_data_dict[train_index]["weather"].update({"wea": weather_dict[date_key]["weather"]})
-            train_data_dict[train_index]["weather"].update({"wind": weather_dict[date_key]["wind"]})
-
-            train_data_dict[train_index]["query"].update({"weekday":reqweekday})
-            train_data_dict[train_index]["query"].update({"hour":reqhour})
-
-            o = line[3].split(',')
-            o_first = o[0]
-            o_second = o[1]
-            train_data_dict[train_index]["query"].update({"o1":float(o_first)})
-            train_data_dict[train_index]["query"].update({"o2":float(o_second)})
-
-            d = line[4].split(',')
-            d_first = d[0]
-            d_second = d[1]
-            train_data_dict[train_index]["query"].update({"d1":float(d_first)})
-            train_data_dict[train_index]["query"].update({"d2":float(d_second)})
-
-    plan_map = {}
-    plan_data = pd.read_csv(TEST_PLANS_PATH)
-    for index, row in plan_data.iterrows():
-        plans_str = row['plans']
-        plans_list = json.loads(plans_str)
-        session_id = str(row['sid'])
-        # train_data_dict[session_id]["plans"] = []
-        plan_map[session_id] = plans_list
-
-    profile_map = {}
-    with open(PROFILES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
-
-    session_click_map = {}
-    with open(TRAIN_CLICK_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "" or line[1] == "" or line[2] == "":
-                continue
-            session_click_map[line[0]] = line[2]
-    #return train_data_dict, profile_map, session_click_map, plan_map
-    generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
-
-
-def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
-    if not os.path.isdir("./out/"):
-        os.mkdir("./out/")
-    with open(os.path.join("./out/", "test_session.txt"), 'w') as f_train:
-        for session_id, plan_list in plan_map.items():
-            if session_id not in train_data_dict:
-                continue
-            cur_map = train_data_dict[session_id]
-            cur_map["session_id"] = session_id
-            if cur_map["pid"] != "":
-                cur_map["profile"] = profile_map[cur_map["pid"]]
-            else:
-                cur_map["profile"] = [0]
-            del cur_map["pid"]
-            whole_rank = 0
-            for plan in plan_list:
-                whole_rank += 1
-                cur_map["mode_rank" + str(whole_rank)] = plan["transport_mode"]
-
-            if whole_rank < 5:
-                for r in range(whole_rank + 1, 6):
-                    cur_map["mode_rank" + str(r)] = -1
-
-            cur_map["whole_rank"] = whole_rank
-            flag_click = False
-            rank = 1
-
-            price_list = []
-            eta_list = []
-            distance_list = []
-            for plan in plan_list:
-                if not plan["price"]:
-                    price_list.append(0)
-                else:
-                    price_list.append(int(plan["price"]))
-                eta_list.append(int(plan["eta"]))
-                distance_list.append(int(plan["distance"]))
-            price_list.sort(reverse=False)
-            eta_list.sort(reverse=False)
-            distance_list.sort(reverse=False)
-
-            for plan in plan_list:
-                if plan["price"] and int(plan["price"]) == price_list[0]:
-                    cur_map["mode_min_price"] = plan["transport_mode"]
-                if plan["price"] and int(plan["price"]) == price_list[-1]:
-                    cur_map["mode_max_price"] = plan["transport_mode"]
-                if int(plan["eta"]) == eta_list[0]:
-                    cur_map["mode_min_eta"] = plan["transport_mode"]
-                if int(plan["eta"]) == eta_list[-1]:
-                    cur_map["mode_max_eta"] = plan["transport_mode"]
-                if int(plan["distance"]) == distance_list[0]:
-                    cur_map["mode_min_distance"] = plan["transport_mode"]
-                if int(plan["distance"]) == distance_list[-1]:
-                    cur_map["mode_max_distance"] = plan["transport_mode"]
-            if "mode_min_price" not in cur_map:
-                cur_map["mode_min_price"] = -1
-            if "mode_max_price" not in cur_map:
-                cur_map["mode_max_price"] = -1
-
-
-            for plan in plan_list:
-                cur_price = int(plan["price"]) if plan["price"] else 0
-                cur_eta = int(plan["eta"])
-                cur_distance = int(plan["distance"])
-                cur_map["price_rank"] = price_list.index(cur_price) + 1
-                cur_map["eta_rank"] = eta_list.index(cur_eta) + 1
-                cur_map["distance_rank"] = distance_list.index(cur_distance) + 1
-
-                if ("transport_mode" in plan) and (session_id in session_click_map) and (
-                        int(plan["transport_mode"]) == int(session_click_map[session_id])):
-                    cur_map["plan"] = plan
-                    cur_map["label"] = 1
-                    flag_click = True
-                    # print("label is 1")
-                else:
-                    cur_map["plan"] = plan
-                    cur_map["label"] = 0
-
-                cur_map["plan_rank"] = rank
-                rank += 1
-                cur_json_instance = json.dumps(cur_map)
-                f_train.write(cur_json_instance + '\n')
-
-            cur_map["plan"]["distance"] = -1
-            cur_map["plan"]["price"] = -1
-            cur_map["plan"]["eta"] = -1
-            cur_map["plan"]["transport_mode"] = 0
-            cur_map["plan_rank"] = 0
-            cur_map["price_rank"] = 0
-            cur_map["eta_rank"] = 0
-            cur_map["plan_rank"] = 0
-            cur_map["label"] = 1
-            cur_json_instance = json.dumps(cur_map)
-            f_train.write(cur_json_instance + '\n')
-
-    build_norm_feature()
-
-
-if __name__ == "__main__":
-    preprocess()
\ No newline at end of file
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/pre_test_dense.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/pre_test_dense.py
deleted file mode 100644
index 58fb483d6395ce3eac236112b648ad86db841e10..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/pre_test_dense.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os, sys, time, random, csv, datetime, json
-import pandas as pd
-import numpy as np
-import argparse
-import logging
-import time
-
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("preprocess")
-logger.setLevel(logging.INFO)
-
-TRAIN_QUERIES_PATH = "./data_set_phase1/test_queries.csv"
-TRAIN_PLANS_PATH = "./data_set_phase1/test_plans.csv"
-TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
-PROFILES_PATH = "./data_set_phase1/profiles.csv"
-
-O1_MIN = 115.47
-O1_MAX = 117.29
-
-O2_MIN = 39.46
-O2_MAX = 40.97
-
-D1_MIN = 115.44
-D1_MAX = 117.37
-
-D2_MIN = 39.46
-D2_MAX = 40.96
-
-DISTANCE_MIN = 1.0
-DISTANCE_MAX = 225864.0
-THRESHOLD_DIS = 200000.0
-
-PRICE_MIN = 200.0
-PRICE_MAX = 92300.0
-THRESHOLD_PRICE = 20000
-
-ETA_MIN = 1.0
-ETA_MAX = 72992.0
-THRESHOLD_ETA = 10800.0
-
-
-def build_norm_feature():
-    with open("./out/normed_test_session.txt", 'w') as nf:
-        with open("./out/test_session.txt", 'r') as f:
-            for line in f:
-                cur_map = json.loads(line)
-
-                cur_map["plan"]["distance"] = (cur_map["plan"]["distance"] - DISTANCE_MIN) / (DISTANCE_MAX - DISTANCE_MIN)
-
-                if cur_map["plan"]["price"]:
-                    cur_map["plan"]["price"] = (cur_map["plan"]["price"] - PRICE_MIN) / (PRICE_MAX - PRICE_MIN)
-                else:
-                    cur_map["plan"]["price"] = 0.0
-
-                cur_map["plan"]["eta"] = (cur_map["plan"]["eta"] - ETA_MIN) / (ETA_MAX - ETA_MIN)
-
-                cur_json_instance = json.dumps(cur_map)
-                nf.write(cur_json_instance + '\n')
-
-
-def preprocess():
-    """
-    Construct the train data indexed by session id and mode id jointly. Convert all the raw features (user profile,
-    od pair, req time, click time, eta, price, distance, transport mode) to one-hot ids used for
-    embedding. We split the one-hot features into two categories: user feature and context feature for
-    better understanding of FFM algorithm.
-    Note that the user profile is already provided by one-hot encoded form, we convert it back to the
-    ids for unity with the context feature and easily using of PaddlePaddle embedding layer. Given the
-    train clicks data, we label each train instance with 1 or 0 depend on if this instance is clicked or
-    not.
-    :return:
-    """
-    #args = parse_args()
-
-    train_data_dict = {}
-    with open("./weather.json", 'r') as f:
-        weather_dict = json.load(f)
-
-    with open(TRAIN_QUERIES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        train_index_list = []
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "": continue
-            if line[1] == "":
-                train_index_list.append(line[0] + "_0")
-            else:
-                train_index_list.append(line[0] + "_" + line[1])
-
-            train_index = line[0]
-            train_data_dict[train_index] = {}
-            train_data_dict[train_index]["pid"] = line[1]
-            train_data_dict[train_index]["query"] = {}
-
-            reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
-            reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
-
-            date_key = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%m-%d")
-            train_data_dict[train_index]["weather"] = {}
-            train_data_dict[train_index]["weather"].update({"max_temp": weather_dict[date_key]["max_temp"]})
-            train_data_dict[train_index]["weather"].update({"min_temp": weather_dict[date_key]["min_temp"]})
-            train_data_dict[train_index]["weather"].update({"wea": weather_dict[date_key]["weather"]})
-            train_data_dict[train_index]["weather"].update({"wind": weather_dict[date_key]["wind"]})
-
-            train_data_dict[train_index]["query"].update({"weekday":reqweekday})
-            train_data_dict[train_index]["query"].update({"hour":reqhour})
-
-            o = line[3].split(',')
-            o_first = o[0]
-            o_second = o[1]
-            train_data_dict[train_index]["query"].update({"o1":float(o_first)})
-            train_data_dict[train_index]["query"].update({"o2":float(o_second)})
-
-            d = line[4].split(',')
-            d_first = d[0]
-            d_second = d[1]
-            train_data_dict[train_index]["query"].update({"d1":float(d_first)})
-            train_data_dict[train_index]["query"].update({"d2":float(d_second)})
-
-    plan_map = {}
-    plan_data = pd.read_csv(TRAIN_PLANS_PATH)
-    for index, row in plan_data.iterrows():
-        plans_str = row['plans']
-        plans_list = json.loads(plans_str)
-        session_id = str(row['sid'])
-        # train_data_dict[session_id]["plans"] = []
-        plan_map[session_id] = plans_list
-
-    profile_map = {}
-    with open(PROFILES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
-
-    session_click_map = {}
-    with open(TRAIN_CLICK_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "" or line[1] == "" or line[2] == "":
-                continue
-            session_click_map[line[0]] = line[2]
-    #return train_data_dict, profile_map, session_click_map, plan_map
-    generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
-
-
-def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
-    if not os.path.isdir("./out/"):
-        os.mkdir("./out/")
-    with open(os.path.join("./out/", "test_session.txt"), 'w') as f_train:
-        for session_id, plan_list in plan_map.items():
-            if session_id not in train_data_dict:
-                continue
-            cur_map = train_data_dict[session_id]
-            cur_map["session_id"] = session_id
-            if cur_map["pid"] != "":
-                cur_map["profile"] = profile_map[cur_map["pid"]]
-            else:
-                cur_map["profile"] = [0]
-            # del cur_map["pid"]
-            whole_rank = 0
-            for plan in plan_list:
-                whole_rank += 1
-                cur_map["mode_rank" + str(whole_rank)] = plan["transport_mode"]
-
-            if whole_rank < 5:
-                for r in range(whole_rank + 1, 6):
-                    cur_map["mode_rank" + str(r)] = -1
-
-            cur_map["whole_rank"] = whole_rank
-            rank = 1
-
-            price_list = []
-            eta_list = []
-            distance_list = []
-            for plan in plan_list:
-                if not plan["price"]:
-                    price_list.append(0)
-                else:
-                    price_list.append(int(plan["price"]))
-                eta_list.append(int(plan["eta"]))
-                distance_list.append(int(plan["distance"]))
-            price_list.sort(reverse=False)
-            eta_list.sort(reverse=False)
-            distance_list.sort(reverse=False)
-
-            for plan in plan_list:
-                if plan["price"] and int(plan["price"]) == price_list[0]:
-                    cur_map["mode_min_price"] = plan["transport_mode"]
-                if plan["price"] and int(plan["price"]) == price_list[-1]:
-                    cur_map["mode_max_price"] = plan["transport_mode"]
-                if int(plan["eta"]) == eta_list[0]:
-                    cur_map["mode_min_eta"] = plan["transport_mode"]
-                if int(plan["eta"]) == eta_list[-1]:
-                    cur_map["mode_max_eta"] = plan["transport_mode"]
-                if int(plan["distance"]) == distance_list[0]:
-                    cur_map["mode_min_distance"] = plan["transport_mode"]
-                if int(plan["distance"]) == distance_list[-1]:
-                    cur_map["mode_max_distance"] = plan["transport_mode"]
-            if "mode_min_price" not in cur_map:
-                cur_map["mode_min_price"] = -1
-            if "mode_max_price" not in cur_map:
-                cur_map["mode_max_price"] = -1
-
-            for plan in plan_list:
-                cur_price = int(plan["price"]) if plan["price"] else 0
-                cur_eta = int(plan["eta"])
-                cur_distance = int(plan["distance"])
-                cur_map["price_rank"] = price_list.index(cur_price) + 1
-                cur_map["eta_rank"] = eta_list.index(cur_eta) + 1
-                cur_map["distance_rank"] = distance_list.index(cur_distance) + 1
-
-                if ("transport_mode" in plan) and (session_id in session_click_map) and (
-                        int(plan["transport_mode"]) == int(session_click_map[session_id])):
-                    cur_map["plan"] = plan
-                    cur_map["label"] = 1
-                else:
-                    cur_map["plan"] = plan
-                    cur_map["label"] = 0
-
-                cur_map["plan_rank"] = rank
-                rank += 1
-                cur_json_instance = json.dumps(cur_map)
-                f_train.write(cur_json_instance + '\n')
-
-            cur_map["plan"]["distance"] = -1
-            cur_map["plan"]["price"] = -1
-            cur_map["plan"]["eta"] = -1
-            cur_map["plan"]["transport_mode"] = 0
-            cur_map["plan_rank"] = 0
-            cur_map["price_rank"] = 0
-            cur_map["eta_rank"] = 0
-            cur_map["plan_rank"] = 0
-            cur_map["label"] = 1
-            cur_json_instance = json.dumps(cur_map)
-            f_train.write(cur_json_instance + '\n')
-
-
-    build_norm_feature()
-
-
-if __name__ == "__main__":
-    preprocess()
\ No newline at end of file
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/preprocess.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/preprocess.py
deleted file mode 100644
index 8d61ae55314b860e3d0b1091ed7616b6ee054962..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/preprocess.py
+++ /dev/null
@@ -1,262 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys, time, random, csv, datetime, json
-import pandas as pd
-import numpy as np
-import argparse
-import logging
-import time
-
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("preprocess")
-logger.setLevel(logging.INFO)
-
-TRAIN_QUERIES_PATH = "./data_set_phase1/train_queries.csv"
-TRAIN_PLANS_PATH = "./data_set_phase1/train_plans.csv"
-TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
-PROFILES_PATH = "./data_set_phase1/profiles.csv"
-OUT_NORM_TRAIN_PATH = "./out/normed_train.txt"
-OUT_RAW_TRAIN_PATH = "./out/train.txt"
-
-OUT_DIR = "./out"
-
-
-O1_MIN = 115.47
-O1_MAX = 117.29
-
-O2_MIN = 39.46
-O2_MAX = 40.97
-
-D1_MIN = 115.44
-D1_MAX = 117.37
-
-D2_MIN = 39.46
-D2_MAX = 40.96
-SCALE_OD = 0.02
-
-DISTANCE_MIN = 1.0
-DISTANCE_MAX = 225864.0
-THRESHOLD_DIS = 40000.0
-SCALE_DIS = 500
-
-PRICE_MIN = 200.0
-PRICE_MAX = 92300.0
-THRESHOLD_PRICE = 20000
-SCALE_PRICE = 100
-
-ETA_MIN = 1.0
-ETA_MAX = 72992.0
-THRESHOLD_ETA = 10800.0
-SCALE_ETA = 120
-
-
-def build_norm_feature():
-    with open(OUT_NORM_TRAIN_PATH, 'w') as nf:
-        with open(OUT_RAW_TRAIN_PATH, 'r') as f:
-            for line in f:
-                cur_map = json.loads(line)
-
-                if cur_map["plan"]["distance"] > THRESHOLD_DIS:
-                    cur_map["plan"]["distance"] = int(THRESHOLD_DIS)
-                elif cur_map["plan"]["distance"] > 0:
-                    cur_map["plan"]["distance"] = int(cur_map["plan"]["distance"] / SCALE_DIS)
-
-                if cur_map["plan"]["price"] and cur_map["plan"]["price"] > THRESHOLD_PRICE:
-                    cur_map["plan"]["price"] = int(THRESHOLD_PRICE)
-                elif not cur_map["plan"]["price"] or cur_map["plan"]["price"] < 0:
-                    cur_map["plan"]["price"] = 0
-                else:
-                    cur_map["plan"]["price"] = int(cur_map["plan"]["price"] / SCALE_PRICE)
-
-                if cur_map["plan"]["eta"] > THRESHOLD_ETA:
-                    cur_map["plan"]["eta"] = int(THRESHOLD_ETA)
-                elif cur_map["plan"]["eta"] > 0:
-                    cur_map["plan"]["eta"] = int(cur_map["plan"]["eta"] / SCALE_ETA)
-
-                # o1
-                if cur_map["query"]["o1"] > O1_MAX:
-                    cur_map["query"]["o1"] = int((O1_MAX - O1_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["o1"] < O1_MIN:
-                    cur_map["query"]["o1"] = 0
-                else:
-                    cur_map["query"]["o1"] = int((cur_map["query"]["o1"] - O1_MIN) / 0.02)
-
-                # o2
-                if cur_map["query"]["o2"] > O2_MAX:
-                    cur_map["query"]["o2"] = int((O2_MAX - O2_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["o2"] < O2_MIN:
-                    cur_map["query"]["o2"] = 0
-                else:
-                    cur_map["query"]["o2"] = int((cur_map["query"]["o2"] - O2_MIN) / 0.02)
-
-                # d1
-                if cur_map["query"]["d1"] > D1_MAX:
-                    cur_map["query"]["d1"] = int((D1_MAX - D1_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["d1"] < D1_MIN:
-                    cur_map["query"]["d1"] = 0
-                else:
-                    cur_map["query"]["d1"] = int((cur_map["query"]["d1"] - D1_MIN) / SCALE_OD)
-
-                # d2
-                if cur_map["query"]["d2"] > D2_MAX:
-                    cur_map["query"]["d2"] = int((D2_MAX - D2_MIN) / SCALE_OD + 1)
-                elif cur_map["query"]["d2"] < D2_MIN:
-                    cur_map["query"]["d2"] = 0
-                else:
-                    cur_map["query"]["d2"] = int((cur_map["query"]["d2"] - D2_MIN) / SCALE_OD)
-
-                cur_json_instance = json.dumps(cur_map)
-                nf.write(cur_json_instance + '\n')
-
-
-def preprocess():
-    """
-    Construct the train data indexed by session id and mode id jointly. Convert all the raw features (user profile,
-    od pair, req time, click time, eta, price, distance, transport mode) to one-hot ids used for
-    embedding. We split the one-hot features into two categories: user feature and context feature for
-    better understanding of FM algorithm.
-    Note that the user profile is already provided by one-hot encoded form, we treat it as embedded vector
-    for unity with the context feature and easily using of PaddlePaddle embedding layer. Given the
-    train clicks data, we label each train instance with 1 or 0 depend on if this instance is clicked or
-    not include non-click case.
-    :return:
-    """
-
-    train_data_dict = {}
-    with open(TRAIN_QUERIES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        train_index_list = []
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "": continue
-            if line[1] == "":
-                train_index_list.append(line[0] + "_0")
-            else:
-                train_index_list.append(line[0] + "_" + line[1])
-
-            train_index = line[0]
-            train_data_dict[train_index] = {}
-            train_data_dict[train_index]["pid"] = line[1]
-            train_data_dict[train_index]["query"] = {}
-
-            reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
-            reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
-
-            train_data_dict[train_index]["query"].update({"weekday":reqweekday})
-            train_data_dict[train_index]["query"].update({"hour":reqhour})
-
-            o = line[3].split(',')
-            o_first = o[0]
-            o_second = o[1]
-            train_data_dict[train_index]["query"].update({"o1":float(o_first)})
-            train_data_dict[train_index]["query"].update({"o2":float(o_second)})
-
-            d = line[4].split(',')
-            d_first = d[0]
-            d_second = d[1]
-            train_data_dict[train_index]["query"].update({"d1":float(d_first)})
-            train_data_dict[train_index]["query"].update({"d2":float(d_second)})
-
-    plan_map = {}
-    plan_data = pd.read_csv(TRAIN_PLANS_PATH)
-    for index, row in plan_data.iterrows():
-        plans_str = row['plans']
-        plans_list = json.loads(plans_str)
-        session_id = str(row['sid'])
-        # train_data_dict[session_id]["plans"] = []
-        plan_map[session_id] = plans_list
-
-    profile_map = {}
-    with open(PROFILES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
-
-    session_click_map = {}
-    with open(TRAIN_CLICK_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "" or line[1] == "" or line[2] == "":
-                continue
-            session_click_map[line[0]] = line[2]
-    #return train_data_dict, profile_map, session_click_map, plan_map
-    generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
-
-
-def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
-    if not os.path.isdir(OUT_DIR):
-        os.mkdir(OUT_DIR)
-    with open(os.path.join("./out/", "train.txt"), 'w') as f_train:
-        for session_id, plan_list in plan_map.items():
-            if session_id not in train_data_dict:
-                continue
-            cur_map = train_data_dict[session_id]
-            if cur_map["pid"] != "":
-                cur_map["profile"] = profile_map[cur_map["pid"]]
-            else:
-                cur_map["profile"] = [0]
-            del cur_map["pid"]
-            whole_rank = 0
-            for plan in plan_list:
-                whole_rank += 1
-            cur_map["whole_rank"] = whole_rank
-            flag_click = False
-            rank = 1
-
-
-            for plan in plan_list:
-
-                if ("transport_mode" in plan) and (session_id in session_click_map) and (
-                        int(plan["transport_mode"]) == int(session_click_map[session_id])):
-                    cur_map["plan"] = plan
-                    cur_map["label"] = 1
-                    flag_click = True
-                    # print("label is 1")
-                else:
-                    cur_map["plan"] = plan
-                    cur_map["label"] = 0
-
-                cur_map["rank"] = rank
-                rank += 1
-                cur_json_instance = json.dumps(cur_map)
-                f_train.write(cur_json_instance + '\n')
-            if not flag_click:
-                cur_map["plan"]["distance"] = -1
-                cur_map["plan"]["price"] = -1
-                cur_map["plan"]["eta"] = -1
-                cur_map["plan"]["transport_mode"] = 0
-                cur_map["rank"] = 0
-                cur_map["label"] = 1
-                cur_json_instance = json.dumps(cur_map)
-                f_train.write(cur_json_instance + '\n')
-            else:
-                cur_map["plan"]["distance"] = -1
-                cur_map["plan"]["price"] = -1
-                cur_map["plan"]["eta"] = -1
-                cur_map["plan"]["transport_mode"] = 0
-                cur_map["rank"] = 0
-                cur_map["label"] = 0
-                cur_json_instance = json.dumps(cur_map)
-                f_train.write(cur_json_instance + '\n')
-
-
-    build_norm_feature()
-
-
-if __name__ == "__main__":
-    preprocess()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/preprocess_dense.py b/PaddleRec/ctr/Paddle_baseline_KDD2019/preprocess_dense.py
deleted file mode 100644
index 10d674c94bb674abb92aa8f2b49c51551c4a976a..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/preprocess_dense.py
+++ /dev/null
@@ -1,294 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, random, csv, datetime, json
-import pandas as pd
-import numpy as np
-import argparse
-import logging
-import time
-
-logging.basicConfig(
-    format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("preprocess")
-logger.setLevel(logging.INFO)
-
-TRAIN_QUERIES_PATH = "./data_set_phase1/train_queries.csv"
-TRAIN_PLANS_PATH = "./data_set_phase1/train_plans.csv"
-TRAIN_CLICK_PATH = "./data_set_phase1/train_clicks.csv"
-PROFILES_PATH = "./data_set_phase1/profiles.csv"
-
-OUT_DIR = "./out"
-ORI_TRAIN_PATH = "train.txt"
-NORM_TRAIN_PATH = "normed_train.txt"
-#variable to control the ratio of positive and negative instance of transmode 0 which is original label of no click
-THRESHOLD_LABEL = 0.5
-
-
-
-O1_MIN = 115.47
-O1_MAX = 117.29
-
-O2_MIN = 39.46
-O2_MAX = 40.97
-
-D1_MIN = 115.44
-D1_MAX = 117.37
-
-D2_MIN = 39.46
-D2_MAX = 40.96
-
-DISTANCE_MIN = 1.0
-DISTANCE_MAX = 225864.0
-THRESHOLD_DIS = 200000.0
-
-PRICE_MIN = 200.0
-PRICE_MAX = 92300.0
-THRESHOLD_PRICE = 20000
-
-ETA_MIN = 1.0
-ETA_MAX = 72992.0
-THRESHOLD_ETA = 10800.0
-
-
-def build_norm_feature():
-    with open(os.path.join(OUT_DIR, NORM_TRAIN_PATH), 'w') as nf:
-        with open(os.path.join(OUT_DIR, ORI_TRAIN_PATH), 'r') as f:
-            for line in f:
-                cur_map = json.loads(line)
-
-                cur_map["plan"]["distance"] = (cur_map["plan"]["distance"] - DISTANCE_MIN) / (DISTANCE_MAX - DISTANCE_MIN)
-
-                if cur_map["plan"]["price"]:
-                    cur_map["plan"]["price"] = (cur_map["plan"]["price"] - PRICE_MIN) / (PRICE_MAX - PRICE_MIN)
-                else:
-                    cur_map["plan"]["price"] = 0.0
-
-                cur_map["plan"]["eta"] = (cur_map["plan"]["eta"] - ETA_MIN) / (ETA_MAX - ETA_MIN)
-
-                cur_json_instance = json.dumps(cur_map)
-                nf.write(cur_json_instance + '\n')
-
-
-def preprocess():
-    """
-    Construct the train data indexed by session id and mode id jointly. Convert all the raw features (user profile,
-    od pair, req time, click time, eta, price, distance, transport mode) to one-hot ids used for
-    embedding. We split the one-hot features into two categories: user feature and context feature for
-    better understanding of FM algorithm.
-    Note that the user profile is already provided by one-hot encoded form, we treat it as embedded vector
-    for unity with the context feature and easily using of PaddlePaddle embedding layer. Given the
-    train clicks data, we label each train instance with 1 or 0 depend on if this instance is clicked or
-    not include non-click case. To Be Changed
-    :return:
-    """
-
-    train_data_dict = {}
-
-    with open("./weather.json", 'r') as f:
-        weather_dict = json.load(f)
-
-    with open(TRAIN_QUERIES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        train_index_list = []
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "": continue
-            if line[1] == "":
-                train_index_list.append(line[0] + "_0")
-            else:
-                train_index_list.append(line[0] + "_" + line[1])
-
-            train_index = line[0]
-            train_data_dict[train_index] = {}
-            train_data_dict[train_index]["pid"] = line[1]
-            train_data_dict[train_index]["query"] = {}
-            train_data_dict[train_index]["weather"] = {}
-
-            reqweekday = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%w")
-            reqhour = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%H")
-
-            # weather related features, no big use, maybe more detailed weather information is better
-            date_key = datetime.datetime.strptime(line[2], '%Y-%m-%d %H:%M:%S').strftime("%m-%d")
-            train_data_dict[train_index]["weather"] = {}
-            train_data_dict[train_index]["weather"].update({"max_temp": weather_dict[date_key]["max_temp"]})
-            train_data_dict[train_index]["weather"].update({"min_temp": weather_dict[date_key]["min_temp"]})
-            train_data_dict[train_index]["weather"].update({"wea": weather_dict[date_key]["weather"]})
-            train_data_dict[train_index]["weather"].update({"wind": weather_dict[date_key]["wind"]})
-
-            train_data_dict[train_index]["query"].update({"weekday":reqweekday})
-            train_data_dict[train_index]["query"].update({"hour":reqhour})
-
-            o = line[3].split(',')
-            o_first = o[0]
-            o_second = o[1]
-            train_data_dict[train_index]["query"].update({"o1":float(o_first)})
-            train_data_dict[train_index]["query"].update({"o2":float(o_second)})
-
-            d = line[4].split(',')
-            d_first = d[0]
-            d_second = d[1]
-            train_data_dict[train_index]["query"].update({"d1":float(d_first)})
-            train_data_dict[train_index]["query"].update({"d2":float(d_second)})
-
-    plan_map = {}
-    plan_data = pd.read_csv(TRAIN_PLANS_PATH)
-    for index, row in plan_data.iterrows():
-        plans_str = row['plans']
-        plans_list = json.loads(plans_str)
-        session_id = str(row['sid'])
-        # train_data_dict[session_id]["plans"] = []
-        plan_map[session_id] = plans_list
-
-    profile_map = {}
-    with open(PROFILES_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            profile_map[line[0]] = [i for i in range(len(line)) if line[i] == "1.0"]
-
-    session_click_map = {}
-    with open(TRAIN_CLICK_PATH, 'r') as f:
-        csv_reader = csv.reader(f, delimiter=',')
-        for k, line in enumerate(csv_reader):
-            if k == 0: continue
-            if line[0] == "" or line[1] == "" or line[2] == "":
-                continue
-            session_click_map[line[0]] = line[2]
-    #return train_data_dict, profile_map, session_click_map, plan_map
-    generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map)
-
-
-def generate_sparse_features(train_data_dict, profile_map, session_click_map, plan_map):
-    if not os.path.isdir(OUT_DIR):
-        os.mkdir(OUT_DIR)
-    with open(os.path.join(OUT_DIR, ORI_TRAIN_PATH), 'w') as f_train:
-        for session_id, plan_list in plan_map.items():
-            if session_id not in train_data_dict:
-                continue
-            cur_map = train_data_dict[session_id]
-            if cur_map["pid"] != "":
-                cur_map["profile"] = profile_map[cur_map["pid"]]
-            else:
-                cur_map["profile"] = [0]
-            
-            #rank information related feature 
-            whole_rank = 0
-            for plan in plan_list:
-                whole_rank += 1
-                cur_map["mode_rank" + str(whole_rank)] = plan["transport_mode"]
-
-            if whole_rank < 5:
-                for r in range(whole_rank + 1, 6):
-                    cur_map["mode_rank" + str(r)] = -1
-
-            cur_map["whole_rank"] = whole_rank
-            flag_click = False
-            rank = 1
-
-            price_list = []
-            eta_list = []
-            distance_list = []
-            for plan in plan_list:
-                if not plan["price"]:
-                    price_list.append(0)
-                else:
-                    price_list.append(int(plan["price"]))
-                eta_list.append(int(plan["eta"]))
-                distance_list.append(int(plan["distance"]))
-            price_list.sort(reverse=False)
-            eta_list.sort(reverse=False)
-            distance_list.sort(reverse=False)
-
-            for plan in plan_list:
-                if plan["price"] and int(plan["price"]) == price_list[0]:
-                    cur_map["mode_min_price"] = plan["transport_mode"]
-                if plan["price"] and int(plan["price"]) == price_list[-1]:
-                    cur_map["mode_max_price"] = plan["transport_mode"]
-                if int(plan["eta"]) == eta_list[0]:
-                    cur_map["mode_min_eta"] = plan["transport_mode"]
-                if int(plan["eta"]) == eta_list[-1]:
-                    cur_map["mode_max_eta"] = plan["transport_mode"]
-                if int(plan["distance"]) == distance_list[0]:
-                    cur_map["mode_min_distance"] = plan["transport_mode"]
-                if int(plan["distance"]) == distance_list[-1]:
-                    cur_map["mode_max_distance"] = plan["transport_mode"]
-            if "mode_min_price" not in cur_map:
-                cur_map["mode_min_price"] = -1
-            if "mode_max_price" not in cur_map:
-                cur_map["mode_max_price"] = -1
-
-            for plan in plan_list:
-                if ("transport_mode" in plan) and (session_id in session_click_map) and (
-                        int(plan["transport_mode"]) == int(session_click_map[session_id])):
-                    flag_click = True
-            if flag_click:
-
-                for plan in plan_list:
-                    cur_price = int(plan["price"]) if plan["price"] else 0
-                    cur_eta = int(plan["eta"])
-                    cur_distance = int(plan["distance"])
-                    cur_map["price_rank"] = price_list.index(cur_price) + 1
-                    cur_map["eta_rank"] = eta_list.index(cur_eta) + 1
-                    cur_map["distance_rank"] = distance_list.index(cur_distance) + 1
-
-                    if ("transport_mode" in plan) and (session_id in session_click_map) and (
-                            int(plan["transport_mode"]) == int(session_click_map[session_id])):
-                        cur_map["plan"] = plan
-                        cur_map["label"] = 1
-                    else:
-                        cur_map["plan"] = plan
-                        cur_map["label"] = 0
-
-                    cur_map["plan_rank"] = rank
-                    rank += 1
-                    cur_json_instance = json.dumps(cur_map)
-                    f_train.write(cur_json_instance + '\n')
-            
-            cur_map["plan"] = {}
-            #since we define a new ctr task from original task, we use a basic way to generate instances of transport mode 0.
-            #There should be a optimal strategy to generate instances of transport mode 0
-            if not flag_click:
-                cur_map["plan"]["distance"] = -1
-                cur_map["plan"]["price"] = -1
-                cur_map["plan"]["eta"] = -1
-                cur_map["plan"]["transport_mode"] = 0
-                cur_map["plan_rank"] = 0
-                cur_map["price_rank"] = 0
-                cur_map["eta_rank"] = 0
-                cur_map["distance_rank"] = 0
-                cur_map["label"] = 1
-                cur_json_instance = json.dumps(cur_map)
-                f_train.write(cur_json_instance + '\n')
-            else:
-                if random.random() < THRESHOLD_LABEL:
-                    cur_map["plan"]["distance"] = -1
-                    cur_map["plan"]["price"] = -1
-                    cur_map["plan"]["eta"] = -1
-                    cur_map["plan"]["transport_mode"] = 0
-                    cur_map["plan_rank"] = 0
-                    cur_map["price_rank"] = 0
-                    cur_map["eta_rank"] = 0
-                    cur_map["distance_rank"] = 0
-                    cur_map["label"] = 0
-                    cur_json_instance = json.dumps(cur_map)
-                    f_train.write(cur_json_instance + '\n')
-
-
-
-    build_norm_feature()
-
-
-if __name__ == "__main__":
-    preprocess()
diff --git a/PaddleRec/ctr/Paddle_baseline_KDD2019/weather.json b/PaddleRec/ctr/Paddle_baseline_KDD2019/weather.json
deleted file mode 100644
index 0d0f17f93b637a4d6d79b987d119bdb4dbab933f..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/Paddle_baseline_KDD2019/weather.json
+++ /dev/null
@@ -1 +0,0 @@
-{"10-01": {"max_temp": "24", "min_temp": "12", "weather": "q", "wind": "45"}, "10-02": {"max_temp": "24", "min_temp": "11", "weather": "q", "wind": "12"}, "10-03": {"max_temp": "25", "min_temp": "10", "weather": "q", "wind": "12"}, "10-04": {"max_temp": "25", "min_temp": "12", "weather": "q", "wind": "12"}, "10-05": {"max_temp": "24", "min_temp": "14", "weather": "dy", "wind": "12"}, "10-06": {"max_temp": "20", "min_temp": "8", "weather": "q", "wind": "45"}, "10-07": {"max_temp": "21", "min_temp": "7", "weather": "q", "wind": "12"}, "10-08": {"max_temp": "21", "min_temp": "8", "weather": "dy", "wind": "12"}, "10-09": {"max_temp": "15", "min_temp": "4", "weather": "dyq", "wind": "45"}, "10-10": {"max_temp": "17", "min_temp": "4", "weather": "dyq", "wind": "12"}, "10-11": {"max_temp": "18", "min_temp": "5", "weather": "qdy", "wind": "12"}, "10-12": {"max_temp": "20", "min_temp": "5", "weather": "dyq", "wind": "12"}, "10-13": {"max_temp": "20", "min_temp": "8", "weather": "dy", "wind": "12"}, "10-14": {"max_temp": "21", "min_temp": "10", "weather": "dy", "wind": "12"}, "10-15": {"max_temp": "17", "min_temp": "11", "weather": "xq", "wind": "12"}, "10-16": {"max_temp": "17", "min_temp": "7", "weather": "dyq", "wind": "12"}, "10-17": {"max_temp": "17", "min_temp": "5", "weather": "q", "wind": "12"}, "10-18": {"max_temp": "18", "min_temp": "5", "weather": "q", "wind": "12"}, "10-19": {"max_temp": "19", "min_temp": "7", "weather": "dy", "wind": "12"}, "10-20": {"max_temp": "18", "min_temp": "7", "weather": "dy", "wind": "12"}, "10-21": {"max_temp": "18", "min_temp": "7", "weather": "dy", "wind": "12"}, "10-22": {"max_temp": "19", "min_temp": "5", "weather": "dyq", "wind": "12"}, "10-23": {"max_temp": "19", "min_temp": "4", "weather": "q", "wind": "34"}, "10-24": {"max_temp": "20", "min_temp": "6", "weather": "qdy", "wind": "12"}, "10-25": {"max_temp": "15", "min_temp": "8", "weather": "dy", "wind": "12"}, "10-26": {"max_temp": "14", "min_temp": "3", "weather": "q", 
"wind": "45"}, "10-27": {"max_temp": "17", "min_temp": "5", "weather": "dy", "wind": "12"}, "10-28": {"max_temp": "17", "min_temp": "4", "weather": "dyq", "wind": "45"}, "10-29": {"max_temp": "15", "min_temp": "3", "weather": "q", "wind": "34"}, "10-30": {"max_temp": "16", "min_temp": "1", "weather": "q", "wind": "12"}, "10-31": {"max_temp": "17", "min_temp": "3", "weather": "q", "wind": "12"}, "11-01": {"max_temp": "17", "min_temp": "3", "weather": "q", "wind": "12"}, "11-02": {"max_temp": "18", "min_temp": "4", "weather": "q", "wind": "12"}, "11-03": {"max_temp": "16", "min_temp": "6", "weather": "dy", "wind": "12"}, "11-04": {"max_temp": "10", "min_temp": "2", "weather": "xydy", "wind": "34"}, "11-05": {"max_temp": "10", "min_temp": "2", "weather": "dy", "wind": "12"}, "11-06": {"max_temp": "12", "min_temp": "0", "weather": "dy", "wind": "12"}, "11-07": {"max_temp": "13", "min_temp": "3", "weather": "dy", "wind": "12"}, "11-08": {"max_temp": "14", "min_temp": "2", "weather": "dy", "wind": "12"}, "11-09": {"max_temp": "15", "min_temp": "1", "weather": "qdy", "wind": "34"}, "11-10": {"max_temp": "11", "min_temp": "0", "weather": "dy", "wind": "12"}, "11-11": {"max_temp": "13", "min_temp": "1", "weather": "dyq", "wind": "12"}, "11-12": {"max_temp": "14", "min_temp": "2", "weather": "q", "wind": "12"}, "11-13": {"max_temp": "13", "min_temp": "5", "weather": "dy", "wind": "12"}, "11-14": {"max_temp": "13", "min_temp": "5", "weather": "dy", "wind": "12"}, "11-15": {"max_temp": "8", "min_temp": "1", "weather": "xydy", "wind": "34"}, "11-16": {"max_temp": "8", "min_temp": "-1", "weather": "q", "wind": "12"}, "11-17": {"max_temp": "9", "min_temp": "-2", "weather": "dyq", "wind": "12"}, "11-18": {"max_temp": "11", "min_temp": "-3", "weather": "q", "wind": "34"}, "11-19": {"max_temp": "10", "min_temp": "-2", "weather": "qdy", "wind": "12"}, "11-20": {"max_temp": "9", "min_temp": "-1", "weather": "dy", "wind": "12"}, "11-21": {"max_temp": "9", "min_temp": "-3", "weather": 
"q", "wind": "2"}, "11-22": {"max_temp": "8", "min_temp": "-3", "weather": "qdy", "wind": "1"}, "11-23": {"max_temp": "7", "min_temp": "0", "weather": "dy", "wind": "2"}, "11-24": {"max_temp": "9", "min_temp": "-3", "weather": "qdy", "wind": "2"}, "11-25": {"max_temp": "10", "min_temp": "-3", "weather": "q", "wind": "1"}, "11-26": {"max_temp": "10", "min_temp": "0", "weather": "dy", "wind": "1"}, "11-27": {"max_temp": "9", "min_temp": "-3", "weather": "qdy", "wind": "2"}, "11-28": {"max_temp": "8", "min_temp": "-3", "weather": "q", "wind": "1"}, "11-29": {"max_temp": "7", "min_temp": "-4", "weather": "q", "wind": "1"}, "11-30": {"max_temp": "8", "min_temp": "-3", "weather": "q", "wind": "1"}, "12-01": {"max_temp": "7", "min_temp": "0", "weather": "dy", "wind": "1"}, "12-02": {"max_temp": "9", "min_temp": "2", "weather": "dy", "wind": "1"}, "12-03": {"max_temp": "8", "min_temp": "-3", "weather": "dyq", "wind": "3"}, "12-04": {"max_temp": "4", "min_temp": "-6", "weather": "qdy", "wind": "2"}, "12-05": {"max_temp": "1", "min_temp": "-4", "weather": "dy", "wind": "1"}, "12-06": {"max_temp": "-2", "min_temp": "-9", "weather": "q", "wind": "3"}, "12-07": {"max_temp": "-4", "min_temp": "-10", "weather": "q", "wind": "3"}, "12-08": {"max_temp": "-2", "min_temp": "-10", "weather": "qdy", "wind": "2"}, "12-09": {"max_temp": "-1", "min_temp": "-10", "weather": "dyq", "wind": "1"}}
\ No newline at end of file
diff --git a/PaddleRec/ctr/dcn/cluster_train.py b/PaddleRec/ctr/dcn/cluster_train.py
deleted file mode 100644
index e791727b30db7e3ed6cefcd558e00b95b4e427fe..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/dcn/cluster_train.py
+++ /dev/null
@@ -1,205 +0,0 @@
-import argparse
-import os
-import sys
-import time
-from collections import OrderedDict
-
-import paddle.fluid as fluid
-
-from network import DCN
-import utils
-
-
-def boolean_string(s):
-    if s.lower() not in {'false', 'true'}:
-        raise ValueError('Not a valid boolean string')
-    return s.lower() == 'true'
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("dcn cluster train.")
-    parser.add_argument(
-        '--train_data_dir',
-        type=str,
-        default='dist_data/dist_train_data',
-        help='The path of train data')
-    parser.add_argument(
-        '--test_valid_data_dir',
-        type=str,
-        default='dist_data/dist_test_valid_data',
-        help='The path of test and valid data')
-    parser.add_argument(
-        '--vocab_dir',
-        type=str,
-        default='dist_data/vocab',
-        help='The path of generated vocabs')
-    parser.add_argument(
-        '--cat_feat_num',
-        type=str,
-        default='dist_data/cat_feature_num.txt',
-        help='The path of generated cat_feature_num.txt')
-    parser.add_argument(
-        '--batch_size', type=int, default=512, help="Batch size")
-    parser.add_argument('--num_epoch', type=int, default=10, help="train epoch")
-    parser.add_argument(
-        '--model_output_dir',
-        type=str,
-        default='models',
-        help='The path for model to store')
-    parser.add_argument(
-        '--num_thread', type=int, default=1, help='The number of threads')
-    parser.add_argument('--test_epoch', type=str, default='1')
-    parser.add_argument(
-        '--dnn_hidden_units',
-        nargs='+',
-        type=int,
-        default=[1024, 1024],
-        help='DNN layers and hidden units')
-    parser.add_argument(
-        '--cross_num',
-        type=int,
-        default=6,
-        help='The number of Cross network layers')
-    parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
-    parser.add_argument(
-        '--l2_reg_cross',
-        type=float,
-        default=1e-5,
-        help='Cross net l2 regularizer coefficient')
-    parser.add_argument(
-        '--use_bn',
-        type=boolean_string,
-        default=True,
-        help='Whether use batch norm in dnn part')
-    parser.add_argument(
-        '--is_sparse',
-        action='store_true',
-        required=False,
-        default=False,
-        help='embedding will use sparse or not, (default: False)')
-    parser.add_argument(
-        '--clip_by_norm', type=float, default=100.0, help="gradient clip norm")
-    parser.add_argument('--print_steps', type=int, default=5)
-    parser.add_argument('--use_gpu', type=int, default=1)
-
-    # dist params
-    parser.add_argument('--is_local', type=int, default=1, help='whether local')
-    parser.add_argument(
-        '--num_devices', type=int, default=1, help='Number of GPU devices')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    args = parser.parse_args()
-    return args
-
-
-def train():
-    """ do training """
-    args = parse_args()
-    print(args)
-
-    if args.trainer_id == 0 and not os.path.isdir(args.model_output_dir):
-        os.mkdir(args.model_output_dir)
-
-    cat_feat_dims_dict = OrderedDict()
-    for line in open(args.cat_feat_num):
-        spls = line.strip().split()
-        assert len(spls) == 2
-        cat_feat_dims_dict[spls[0]] = int(spls[1])
-
-    dcn_model = DCN(args.cross_num, args.dnn_hidden_units, args.l2_reg_cross,
-                    args.use_bn, args.clip_by_norm, cat_feat_dims_dict,
-                    args.is_sparse)
-    dcn_model.build_network()
-    optimizer = fluid.optimizer.Adam(learning_rate=args.lr)
-    optimizer.minimize(dcn_model.loss)
-
-    def train_loop(main_program):
-        """ train network """
-        start_time = time.time()
-        dataset = fluid.DatasetFactory().create_dataset()
-        dataset.set_use_var(dcn_model.data_list)
-        pipe_command = 'python reader.py {}'.format(args.vocab_dir)
-        dataset.set_pipe_command(pipe_command)
-        dataset.set_batch_size(args.batch_size)
-        dataset.set_thread(args.num_thread)
-        train_filelist = [
-            os.path.join(args.train_data_dir, fname)
-            for fname in next(os.walk(args.train_data_dir))[2]
-        ]
-        dataset.set_filelist(train_filelist)
-
-        if args.use_gpu == 1:
-            exe = fluid.Executor(fluid.CUDAPlace(0))
-            dataset.set_thread(1)
-        else:
-            exe = fluid.Executor(fluid.CPUPlace())
-            dataset.set_thread(args.num_thread)
-        exe.run(fluid.default_startup_program())
-
-        for epoch_id in range(args.num_epoch):
-            start = time.time()
-            sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
-            exe.train_from_dataset(
-                program=main_program,
-                dataset=dataset,
-                fetch_list=[
-                    dcn_model.loss, dcn_model.avg_logloss, dcn_model.auc_var
-                ],
-                fetch_info=['total_loss', 'avg_logloss', 'auc'],
-                debug=False,
-                print_period=args.print_steps)
-            model_dir = os.path.join(args.model_output_dir,
-                                     'epoch_' + str(epoch_id + 1), "checkpoint")
-            sys.stderr.write('epoch%d is finished and takes %f s\n' % (
-                (epoch_id + 1), time.time() - start))
-            if args.trainer_id == 0:  # only trainer 0 save model
-                print("save model in {}".format(model_dir))
-                fluid.save(main_program, model_dir)
-
-        print("train time cost {:.4f}".format(time.time() - start_time))
-        print("finish training")
-
-    if args.is_local:
-        print("run local training")
-        train_loop(fluid.default_main_program())
-    else:
-        print("run distribute training")
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-        if args.role == "pserver":
-            print("run psever")
-            pserver_prog, pserver_startup = t.get_pserver_programs(
-                args.current_endpoint)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif args.role == "trainer":
-            print("run trainer")
-            train_loop(t.get_trainer_program())
-
-
-if __name__ == "__main__":
-    utils.check_version()
-    train()
diff --git a/PaddleRec/ctr/dcn/cluster_train.sh b/PaddleRec/ctr/dcn/cluster_train.sh
deleted file mode 100755
index 4088a8af906ec033168701a8b8805ad3846b634a..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/dcn/cluster_train.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-# start pserver0
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --is_local 0 \
-    --is_sparse \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-# start pserver1
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --is_local 0 \
-    --is_sparse \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --use_gpu 0 \
-    --is_local 0 \
-    --is_sparse \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-
-# start trainer1
-#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --use_gpu 0 \
-    --is_local 0 \
-    --is_sparse \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
-
-echo "2 pservers and 2 trainers started."
\ No newline at end of file
diff --git a/PaddleRec/ctr/dcn/network.py b/PaddleRec/ctr/dcn/network.py
index ffa399edc61134146b3cca33d1e5f0adfc733659..8dd6503847fd46ccdebc3f75887173201fab15e1 100644
--- a/PaddleRec/ctr/dcn/network.py
+++ b/PaddleRec/ctr/dcn/network.py
@@ -76,11 +76,10 @@ class DCN(object):
 
     def backward(self, lr):
         p_g_clip = fluid.backward.append_backward(loss=self.loss)
-        fluid.clip.set_gradient_clip(
-            fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_by_norm))
+        clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_by_norm)
         p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
 
-        optimizer = fluid.optimizer.Adam(learning_rate=lr)
+        optimizer = fluid.optimizer.Adam(learning_rate=lr, grad_clip=clip)
         # params_grads = optimizer.backward(self.loss)
         optimizer.apply_gradients(p_g_clip)
 
diff --git a/PaddleRec/ctr/dcn/reader.py b/PaddleRec/ctr/dcn/reader.py
index 291fc988edb3683aec5ef529ec78fbd87897fc72..d121f9fd3c8104ba8c24f320f184ffd282488080 100644
--- a/PaddleRec/ctr/dcn/reader.py
+++ b/PaddleRec/ctr/dcn/reader.py
@@ -72,8 +72,8 @@ class CriteoDataset(dg.MultiSlotDataGenerator):
                         yield label_feat_list
 
         import paddle
-        batch_iter = paddle.batch(
-            paddle.reader.buffered(
+        batch_iter = fluid.io.batch(
+            fluid.io.buffered(
                 local_iter, size=buf_size), batch_size=batch)
         return batch_iter
 
diff --git a/PaddleRec/ctr/deepfm/cluster_train.py b/PaddleRec/ctr/deepfm/cluster_train.py
deleted file mode 100644
index da565172c16be7bd3aa0533d4bdd5053cf79f937..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/deepfm/cluster_train.py
+++ /dev/null
@@ -1,193 +0,0 @@
-import argparse
-import os
-import sys
-import time
-from network_conf import ctr_deepfm_model
-
-import paddle.fluid as fluid
-import utils
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("deepfm cluster train.")
-
-    parser.add_argument(
-        '--train_data_dir',
-        type=str,
-        default='dist_data/dist_train_data',
-        help='The path of train data (default: data/train_data)')
-    parser.add_argument(
-        '--test_data_dir',
-        type=str,
-        default='dist_data/dist_test_data',
-        help='The path of test data (default: models)')
-    parser.add_argument(
-        '--feat_dict',
-        type=str,
-        default='dist_data/aid_data/feat_dict_10.pkl2',
-        help='The path of feat_dict')
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=100,
-        help="The size of mini-batch (default:100)")
-    parser.add_argument(
-        '--embedding_size',
-        type=int,
-        default=10,
-        help="The size for embedding layer (default:10)")
-    parser.add_argument(
-        '--num_epoch',
-        type=int,
-        default=10,
-        help="The number of epochs to train (default: 50)")
-    parser.add_argument(
-        '--model_output_dir',
-        type=str,
-        required=True,
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--num_thread',
-        type=int,
-        default=1,
-        help='The number of threads (default: 1)')
-    parser.add_argument('--test_epoch', type=str, default='1')
-    parser.add_argument(
-        '--layer_sizes',
-        nargs='+',
-        type=int,
-        default=[400, 400, 400],
-        help='The size of each layers (default: [10, 10, 10])')
-    parser.add_argument(
-        '--act',
-        type=str,
-        default='relu',
-        help='The activation of each layers (default: relu)')
-    parser.add_argument(
-        '--is_sparse',
-        action='store_true',
-        required=False,
-        default=False,
-        help='embedding will use sparse or not, (default: False)')
-    parser.add_argument(
-        '--lr', type=float, default=1e-4, help='Learning rate (default: 1e-4)')
-    parser.add_argument(
-        '--reg', type=float, default=1e-4, help=' (default: 1e-4)')
-    parser.add_argument('--num_field', type=int, default=39)
-    parser.add_argument('--num_feat', type=int, default=141443)
-    parser.add_argument('--use_gpu', type=int, default=1)
-
-    # dist params
-    parser.add_argument('--is_local', type=int, default=1, help='whether local')
-    parser.add_argument(
-        '--num_devices', type=int, default=1, help='Number of GPU devices')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    args = parser.parse_args()
-    return args
-
-
-def train():
-    """ do training """
-    args = parse_args()
-    print(args)
-
-    if args.trainer_id == 0 and not os.path.isdir(args.model_output_dir):
-        os.mkdir(args.model_output_dir)
-
-    loss, auc, data_list, auc_states = ctr_deepfm_model(
-        args.embedding_size, args.num_field, args.num_feat, args.layer_sizes,
-        args.act, args.reg, args.is_sparse)
-    optimizer = fluid.optimizer.SGD(
-        learning_rate=args.lr,
-        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
-    optimizer.minimize(loss)
-
-    def train_loop(main_program):
-        """ train network """
-        start_time = time.time()
-        dataset = fluid.DatasetFactory().create_dataset()
-        dataset.set_use_var(data_list)
-        pipe_command = 'python criteo_reader.py {}'.format(args.feat_dict)
-        dataset.set_pipe_command(pipe_command)
-        dataset.set_batch_size(args.batch_size)
-        dataset.set_thread(args.num_thread)
-        train_filelist = [
-            os.path.join(args.train_data_dir, x)
-            for x in os.listdir(args.train_data_dir)
-        ]
-
-        if args.use_gpu == 1:
-            exe = fluid.Executor(fluid.CUDAPlace(0))
-            dataset.set_thread(1)
-        else:
-            exe = fluid.Executor(fluid.CPUPlace())
-            dataset.set_thread(args.num_thread)
-        exe.run(fluid.default_startup_program())
-
-        for epoch_id in range(args.num_epoch):
-            start = time.time()
-            sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
-            dataset.set_filelist(train_filelist)
-            exe.train_from_dataset(
-                program=main_program,
-                dataset=dataset,
-                fetch_list=[loss, auc],
-                fetch_info=['epoch %d batch loss' % (epoch_id + 1), "auc"],
-                print_period=5,
-                debug=False)
-            model_dir = os.path.join(args.model_output_dir,
-                                     'epoch_' + str(epoch_id + 1))
-            sys.stderr.write('epoch%d is finished and takes %f s\n' % (
-                (epoch_id + 1), time.time() - start))
-            if args.trainer_id == 0:  # only trainer 0 save model
-                print("save model in {}".format(model_dir))
-                fluid.save(main_program, model_dir)
-
-        print("train time cost {:.4f}".format(time.time() - start_time))
-        print("finish training")
-
-    if args.is_local:
-        print("run local training")
-        train_loop(fluid.default_main_program())
-    else:
-        print("run distribute training")
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-        if args.role == "pserver":
-            print("run psever")
-            pserver_prog, pserver_startup = t.get_pserver_programs(
-                args.current_endpoint)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif args.role == "trainer":
-            print("run trainer")
-            train_loop(t.get_trainer_program())
-
-
-if __name__ == "__main__":
-    utils.check_version()
-    train()
diff --git a/PaddleRec/ctr/deepfm/cluster_train.sh b/PaddleRec/ctr/deepfm/cluster_train.sh
deleted file mode 100755
index 4088a8af906ec033168701a8b8805ad3846b634a..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/deepfm/cluster_train.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-# start pserver0
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --is_local 0 \
-    --is_sparse \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-# start pserver1
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --is_local 0 \
-    --is_sparse \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --use_gpu 0 \
-    --is_local 0 \
-    --is_sparse \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-
-# start trainer1
-#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
-python -u cluster_train.py \
-    --train_data_dir dist_data/dist_train_data \
-    --model_output_dir cluster_model \
-    --use_gpu 0 \
-    --is_local 0 \
-    --is_sparse \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
-
-echo "2 pservers and 2 trainers started."
\ No newline at end of file
diff --git a/PaddleRec/ctr/deepfm/infer.py b/PaddleRec/ctr/deepfm/infer.py
index 9ff58af7dc80cec637d590638b62ca4d300939ea..22b9a95eeb2e79258c89ce9cb51078d550685c10 100644
--- a/PaddleRec/ctr/deepfm/infer.py
+++ b/PaddleRec/ctr/deepfm/infer.py
@@ -30,7 +30,7 @@ def infer():
     ]
     criteo_dataset = CriteoDataset()
     criteo_dataset.setup(args.feat_dict)
-    test_reader = paddle.batch(
+    test_reader = fluid.io.batch(
         criteo_dataset.test(test_files), batch_size=args.batch_size)
 
     startup_program = fluid.framework.Program()
diff --git a/PaddleRec/ctr/deepfm_dygraph/data_reader.py b/PaddleRec/ctr/deepfm_dygraph/data_reader.py
index 7c9d9abcd8f1b64adf903703f5eec995e04016ae..0cd800cb7c9cf510c97f68d61263df949f53bf60 100644
--- a/PaddleRec/ctr/deepfm_dygraph/data_reader.py
+++ b/PaddleRec/ctr/deepfm_dygraph/data_reader.py
@@ -6,6 +6,7 @@ import pickle
 import random
 
 import paddle
+import paddle.fluid as fluid
 
 
 class DataGenerator(object):
@@ -58,7 +59,7 @@ class DataGenerator(object):
                 if not cycle:
                     break
 
-        return paddle.batch(_reader, batch_size=batch_size)
+        return fluid.io.batch(_reader, batch_size=batch_size)
 
 
 def data_reader(batch_size,
diff --git a/PaddleRec/ctr/din/README.md b/PaddleRec/ctr/din/README.md
index 8644a75c8053769049031aab415f86f71f171138..ea8585c0c5bf459771c81ea90d6d5a4d5c8771d7 100644
--- a/PaddleRec/ctr/din/README.md
+++ b/PaddleRec/ctr/din/README.md
@@ -8,8 +8,6 @@
 ├── train.py             # 训练脚本
 ├── infer.py             # 预测脚本
 ├── network.py           # 网络结构
-├── cluster_train.py     # 多机训练
-├── cluster_train.sh     # 多机训练脚本
 ├── reader.py            # 和读取数据相关的函数
 ├── data/
     ├── build_dataset.py    # 文本数据转化为paddle数据
@@ -129,12 +127,3 @@ CUDA_VISIBLE_DEVICES=3 python infer.py --model_path 'din_amazon/global_step_4000
 ```text
 2019-02-22 11:22:58,804 - INFO - TEST --> loss: [0.47005194] auc:0.863794952818
 ```
-
-
-## 多机训练
-可参考cluster_train.py 配置多机环境
-
-运行命令本地模拟多机场景
-```
-sh cluster_train.sh
-```
diff --git a/PaddleRec/ctr/din/cluster_train.py b/PaddleRec/ctr/din/cluster_train.py
deleted file mode 100644
index 6b3272366fa674c2bfaa6454beb2c93de1545a4f..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/din/cluster_train.py
+++ /dev/null
@@ -1,172 +0,0 @@
-import sys
-import logging
-import time
-import numpy as np
-import argparse
-import paddle.fluid as fluid
-import paddle
-import time
-import network
-import reader
-import random
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("din")
-    parser.add_argument(
-        '--config_path',
-        type=str,
-        default='data/config.txt',
-        help='dir of config')
-    parser.add_argument(
-        '--train_dir',
-        type=str,
-        default='data/paddle_train.txt',
-        help='dir of train file')
-    parser.add_argument(
-        '--model_dir',
-        type=str,
-        default='din_amazon/',
-        help='dir of saved model')
-    parser.add_argument(
-        '--batch_size', type=int, default=16, help='number of batch size')
-    parser.add_argument(
-        '--epoch_num', type=int, default=200, help='number of epoch')
-    parser.add_argument(
-        '--use_cuda', type=int, default=0, help='whether to use gpu')
-    parser.add_argument(
-        '--parallel',
-        type=int,
-        default=0,
-        help='whether to use parallel executor')
-    parser.add_argument(
-        '--base_lr', type=float, default=0.85, help='based learning rate')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    args = parser.parse_args()
-    return args
-
-
-def train():
-    args = parse_args()
-
-    config_path = args.config_path
-    train_path = args.train_dir
-    epoch_num = args.epoch_num
-    use_cuda = True if args.use_cuda else False
-    use_parallel = True if args.parallel else False
-
-    logger.info("reading data begins")
-    user_count, item_count, cat_count = reader.config_read(config_path)
-    #data_reader, max_len = reader.prepare_reader(train_path, args.batch_size)
-    logger.info("reading data completes")
-
-    avg_cost, pred = network.network(item_count, cat_count, 433)
-    #fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
-    base_lr = args.base_lr
-    boundaries = [410000]
-    values = [base_lr, 0.2]
-    sgd_optimizer = fluid.optimizer.SGD(
-        learning_rate=fluid.layers.piecewise_decay(
-            boundaries=boundaries, values=values))
-    sgd_optimizer.minimize(avg_cost)
-
-    def train_loop(main_program):
-        data_reader, max_len = reader.prepare_reader(train_path,
-                                                     args.batch_size)
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        feeder = fluid.DataFeeder(
-            feed_list=[
-                "hist_item_seq", "hist_cat_seq", "target_item", "target_cat",
-                "label", "mask", "target_item_seq", "target_cat_seq"
-            ],
-            place=place)
-        if use_parallel:
-            train_exe = fluid.ParallelExecutor(
-                use_cuda=use_cuda,
-                loss_name=avg_cost.name,
-                main_program=main_program)
-        else:
-            train_exe = exe
-        logger.info("train begins")
-        global_step = 0
-        PRINT_STEP = 1000
-
-        start_time = time.time()
-        loss_sum = 0.0
-        for id in range(epoch_num):
-            epoch = id + 1
-            for data in data_reader():
-                global_step += 1
-                results = train_exe.run(main_program,
-                                        feed=feeder.feed(data),
-                                        fetch_list=[avg_cost.name, pred.name],
-                                        return_numpy=True)
-                loss_sum += results[0].mean()
-
-                if global_step % PRINT_STEP == 0:
-                    logger.info(
-                        "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
-                        % (epoch, global_step, loss_sum / PRINT_STEP,
-                           time.time() - start_time))
-                    start_time = time.time()
-                    loss_sum = 0.0
-
-                    if (global_step > 400000 and
-                            global_step % PRINT_STEP == 0) or (
-                                global_step < 400000 and
-                                global_step % 50000 == 0):
-                        save_dir = args.model_dir + "/global_step_" + str(
-                            global_step)
-                        feed_var_name = [
-                            "hist_item_seq", "hist_cat_seq", "target_item",
-                            "target_cat", "label", "mask", "target_item_seq",
-                            "target_cat_seq"
-                        ]
-                        fetch_vars = [avg_cost, pred]
-                        fluid.io.save_inference_model(save_dir, feed_var_name,
-                                                      fetch_vars, exe)
-        train_exe.close()
-
-    t = fluid.DistributeTranspiler()
-    t.transpile(
-        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-    if args.role == "pserver":
-        logger.info("run psever")
-        prog, startup = t.get_pserver_programs(args.current_endpoint)
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(startup)
-        exe.run(prog)
-    elif args.role == "trainer":
-        logger.info("run trainer")
-        train_loop(t.get_trainer_program())
-
-
-if __name__ == "__main__":
-    train()
diff --git a/PaddleRec/ctr/din/cluster_train.sh b/PaddleRec/ctr/din/cluster_train.sh
deleted file mode 100644
index 76115c825423f5de4a1114be863cc7ec40bad0b4..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/din/cluster_train.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-python -u cluster_train.py \
---config_path 'data/config.txt' \
---train_dir 'data/paddle_train.txt' \
---batch_size 32 \
---epoch_num 100 \
---use_cuda 0 \
---parallel 0 \
---role pserver \
---endpoints 127.0.0.1:6000,127.0.0.1:6001 \
---current_endpoint 127.0.0.1:6000 \
---trainers 2 \
-> pserver0.log 2>&1 &
-
-python -u cluster_train.py \
---config_path 'data/config.txt' \
---train_dir 'data/paddle_train.txt' \
---batch_size 32 \
---epoch_num 100 \
---use_cuda 0 \
---parallel 0 \
---role pserver \
---endpoints 127.0.0.1:6000,127.0.0.1:6001 \
---current_endpoint 127.0.0.1:6001 \
---trainers 2 \
-> pserver1.log 2>&1 &
-
-python -u cluster_train.py \
---config_path 'data/config.txt' \
---train_dir 'data/paddle_train.txt' \
---batch_size 32 \
---epoch_num 100 \
---use_cuda 0 \
---parallel 0 \
---role trainer \
---endpoints 127.0.0.1:6000,127.0.0.1:6001 \
---trainers 2 \
---trainer_id 0 \
-> trainer0.log 2>&1 &
-
-python -u cluster_train.py \
---config_path 'data/config.txt' \
---train_dir 'data/paddle_train.txt' \
---batch_size 32 \
---epoch_num 100 \
---use_cuda 0 \
---parallel 0 \
---role trainer \
---endpoints 127.0.0.1:6000,127.0.0.1:6001 \
---trainers 2 \
---trainer_id 1 \
-> trainer1.log 2>&1 &
diff --git a/PaddleRec/ctr/din/train.py b/PaddleRec/ctr/din/train.py
index a519721101b14f4d6de717f8a1f061ece5e85e7f..4461eb6729fbb262d8265c9fc6ca1eff172ea3e2 100644
--- a/PaddleRec/ctr/din/train.py
+++ b/PaddleRec/ctr/din/train.py
@@ -92,14 +92,15 @@ def train():
     logger.info("reading data completes")
 
     avg_cost, pred, feed_list = network.network(item_count, cat_count)
-    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
-        clip_norm=5.0))
+
+    clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
     base_lr = args.base_lr
     boundaries = [410000]
     values = [base_lr, 0.2]
     sgd_optimizer = fluid.optimizer.SGD(
         learning_rate=fluid.layers.piecewise_decay(
-            boundaries=boundaries, values=values))
+            boundaries=boundaries, values=values),
+        grad_clip=clip)
     sgd_optimizer.minimize(avg_cost)
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
diff --git a/PaddleRec/ctr/dnn/README.md b/PaddleRec/ctr/dnn/README.md
index fdfdb1c4d0e1c2ba4f13d21aea6c94b461263d57..989de5a0f7064e046537836e1b3aec6150d9d98c 100644
--- a/PaddleRec/ctr/dnn/README.md
+++ b/PaddleRec/ctr/dnn/README.md
@@ -154,7 +154,7 @@ def embedding_layer(input):
    return fluid.layers.embedding(
             input=input,
             is_sparse=True,
-            size=[args.sparse_feature_dim, 
+            size=[args.sparse_feature_dim,
                   args.embedding_size],
             param_attr=fluid.ParamAttr(
             name="SparseFeatFactors",
@@ -168,7 +168,7 @@ sparse_embed_seq = list(map(embedding_layer, inputs[1:-1])) # [C1~C26]
 将离散数据通过embedding查表得到的值，与连续数据的输入进行`concat`操作，合为一个整体输入，作为全链接层的原始输入。我们共设计了3层FC，每层FC的输出维度都为400，每层FC都后接一个`relu`激活函数，每层FC的初始化方式为符合正态分布的随机初始化，标准差与上一层的输出维度的平方根成反比。
 ```python
 concated = fluid.layers.concat(sparse_embed_seq + inputs[0:1], axis=1)
-        
+
 fc1 = fluid.layers.fc(
    input=concated,
    size=400,
@@ -271,9 +271,9 @@ continuous_range_ = range(1, 14)
 categorical_range_ = range(14, 40)
 
 class CriteoDataset(dg.MultiSlotDataGenerator):
-   
+
     def generate_sample(self, line):
-        
+
         def reader():
             features = line.rstrip('\n').split('\t')
             dense_feature = []
@@ -363,12 +363,12 @@ fleet.init(role) #必不可少的步骤，初始化节点！
 
 
 > PaddleCloudRoleMaker()是怎样判断当前节点所扮演的角色的？
-> 
+>
 > Paddle参数服务器模式中，使用各个节点机器的环境变量来确定当前节点的角色。为了能准确无误的分配角色，在每个节点上，我们都需要指定如下环境变量：
 > #### 共有的环境变量
 > - export PADDLE_TRAINERS_NUM=2 # 训练节点数
 > - export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:36011,127.0.0.1:36012" # 各个pserver的ip:port 组合构成的字符串
-> 
+>
 > #### Pserver特有的环境变量
 > - export TRAINING_ROLE=PSERVER # 当前节点的角色是PSERVER
 > - export PADDLE_PORT=36011 # 当前PSERVER的通信端口
@@ -376,7 +376,7 @@ fleet.init(role) #必不可少的步骤，初始化节点！
 > #### Trainer特有的环境变量
 > - export TRAINING_ROLE=TRAINER # 当前节点的角色是TRAINER
 > - export PADDLE_TRAINER_ID=0 # 当前Trainer节点的编号,范围为[0，PADDLE_TRAINERS_NUM)
-> 
+>
 > 完成上述环境变量指定后，`PaddleCloudRoleMaker()`便可以正常的运行，决定当前节点的角色。
 
 
@@ -388,7 +388,7 @@ Paddle的`参数服务器`模式分布式训练有很多种类型，根据通信
 ctr_model = CTR()
 inputs = ctr_model.input_data(args)
 avg_cost, auc_var, batch_auc_var = ctr_model.net(inputs,args)
-    
+
 # 选择反向更新优化策略
 optimizer = fluid.optimizer.Adam(args.learning_rate)
 optimizer.minimize(avg_cost)
@@ -431,7 +431,7 @@ if fleet.is_server():
     fleet.run_server()
 ```
 - 启动Worker
- 
+
 启动训练节点，训练节点首先调用`init_worker()`来完成节点初始化，然后执行`fleet.startup_program`，从服务器端同步参数的初始化值。接着，和本地训练完全一致，通过执行`fleet.main_program`来完成整个训练过程，并保存模型。最后调用`fleet.stop_worker()`关闭训练节点。
 ```python
 elif fleet.is_worker():
@@ -441,7 +441,7 @@ elif fleet.is_worker():
 
     # 初始化含有分布式流程的fleet.startup_program
     exe.run(fleet.startup_program))
-    
+
     # 引入数据读取dataset
     dataset = get_dataset(inputs,params)
 
@@ -458,10 +458,10 @@ elif fleet.is_worker():
         # 默认使用0号节点保存模型
         if params.test and fleet.is_first_worker():
             model_path = (str(params.model_path) + "/"+"epoch_" + str(epoch))
-            fluid.io.save_persistables(executor=exe, dirname=model_path)
-    
+            fleet.save_persistables(executor=exe, dirname=model_path)
+
     # 训练结束，调用stop_worker()通知pserver
-    fleet.stop_worker() 
+    fleet.stop_worker()
     logger.info("Distribute Train Success!")
     return train_result
 ```
@@ -504,7 +504,7 @@ sh local_cluster.sh
 便可以开启分布式模拟训练，默认启用2x2的训练模式。Trainer与Pserver的运行日志，存放于`./log/`文件夹，保存的模型位于`./models/`，使用默认配置运行后，理想输出为：
 - pserver.0.log
 ```bash
-get_pserver_program() is deprecated, call get_pserver_programs() to get pserver main and startup in a single call.
+
 I1126 07:37:49.952580 15056 grpc_server.cc:477] Server listening on 127.0.0.1:36011 successful, selected port: 36011
 ```
 
@@ -558,9 +558,9 @@ I1126 07:38:28.947571 14715 communicator.cc:363] Communicator stop done
 2. 在很多应用场景中，分布式训练出的模型与实际上线的模型不一致，仅使用分布式训练出的参数值，参与其他网络的预测，在这样的场景中，就更无必要保存模型结构了。
 
 > 什么是长期变量？
-> 
+>
 > 在Paddle Fluid中，模型变量可以分为以下几种类型：
-> 
+>
 > 1. 模型参数：是深度学习模型中被训练和学习的量。由`fluid.framwork.Parameter()`产生，是`fluid.framework.Variable()`的派生类。
 > 2. 长期变量 ：是在整个训练过程中持续存在，不会因为一个迭代结束而销毁的变量，所有的模型参数都是长期变量，但并非所有的长期变量都是模型参数。长期变量通过将`fluid.framework.Varibale()`中的`psersistable`属性设置为`True`来声明。长期变量是模型的核心参数。
 > 3. 临时变量：不属于上述两种类别的所有变量都是临时变量，只在一个训练迭代中存在，在每一个迭代结束后，所有的临时变量都会被销毁，然后在下一个迭代开始时，创建新的临时变量。例如输入的训练数据，中间层layer的输出等等。
@@ -632,7 +632,7 @@ with fluid.framework.program_guard(test_program, startup_program):
    ```
    这是容易理解的，因为在测试时，我们要从零开始，保证预测program的干净，没有其他的影响因素。
 -  在创建预测网络时，我们加入了`with fluid.unique_name.guard():`，它的作用是让所有新建的参数的自动编号再次从零开始。Paddle的参数`Variable`以变量名作为区分手段，保证变量名相同，就可以从保存的模型中找到对应参数。
-  
+
     paddle创建的临时变量，编号会自动顺延，如果没有指定变量名，可以观察到这一现象，比如：`fc_1.w_0`->`fc_2.w_0`，想要共享相同的参数，必需要保证编号可以对应。
 
 ### 测试数据的读取
@@ -774,14 +774,14 @@ python -u train.py --is_cloud=1
 
 运行该命令时，若pserver还未就绪，可在日志输出中看到如下信息：
 > server not ready, wait 3 sec to retry...
-> 
+>
 > not ready endpoints:['10.89.176.11:36000', '10.89.176.12:36000']
 
 worker进程将持续等待，直到server开始监听，或等待超时。
 
 当pserver都准备就绪后，可以在日志输出看到如下信息：
 > I0317 11:38:48.099179 16719 communicator.cc:271] Communicator start
-> 
+>
 > I0317 11:38:49.838711 16719 rpc_client.h:107] init rpc client with trainer_id 0
 
 至此，分布式训练启动完毕，将开始训练，祝您好运。
diff --git a/PaddleRec/ctr/dnn/infer.py b/PaddleRec/ctr/dnn/infer.py
index 11bbdbaddb61b75d02e44639494692768ca29766..19a97e30fbe6196f33531c948ceb67dce9d46931 100644
--- a/PaddleRec/ctr/dnn/infer.py
+++ b/PaddleRec/ctr/dnn/infer.py
@@ -30,8 +30,7 @@ logger.setLevel(logging.INFO)
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(
-        description="PaddlePaddle CTR-DNN example")
+    parser = argparse.ArgumentParser(description="PaddlePaddle CTR-DNN example")
     # -------------Data & Model Path-------------
     parser.add_argument(
         '--test_files_path',
@@ -54,8 +53,7 @@ def parse_args():
         '--infer_epoch',
         type=int,
         default=0,
-        help='Specify which epoch to run infer'
-    )
+        help='Specify which epoch to run infer')
     # -------------Network parameter-------------
     parser.add_argument(
         '--embedding_size',
@@ -68,10 +66,7 @@ def parse_args():
         default=1000001,
         help='sparse feature hashing space for index processing')
     parser.add_argument(
-        '--dense_feature_dim',
-        type=int,
-        default=13,
-        help='dense feature shape')
+        '--dense_feature_dim', type=int, default=13, help='dense feature shape')
 
     # -------------device parameter-------------
     parser.add_argument(
@@ -102,10 +97,11 @@ def run_infer(args, model_path):
     place = fluid.CPUPlace()
     train_generator = generator.CriteoDataset(args.sparse_feature_dim)
     file_list = [
-        os.path.join(args.test_files_path, x) for x in os.listdir(args.test_files_path)
+        os.path.join(args.test_files_path, x)
+        for x in os.listdir(args.test_files_path)
     ]
-    test_reader = paddle.batch(train_generator.test(file_list),
-                               batch_size=args.batch_size)
+    test_reader = fluid.io.batch(
+        train_generator.test(file_list), batch_size=args.batch_size)
     startup_program = fluid.framework.Program()
     test_program = fluid.framework.Program()
     ctr_model = CTR()
@@ -171,13 +167,15 @@ if __name__ == "__main__":
     model_list = []
     for _, dir, _ in os.walk(args.model_path):
         for model in dir:
-            if "epoch" in model and args.infer_epoch == int(model.split('_')[-1]):
+            if "epoch" in model and args.infer_epoch == int(
+                    model.split('_')[-1]):
                 path = os.path.join(args.model_path, model)
                 model_list.append(path)
 
     if len(model_list) == 0:
-        logger.info("There is no satisfactory model {} at path {}, please check your start command & env. ".format(
-            str("epoch_")+str(args.infer_epoch), args.model_path))
+        logger.info(
+            "There is no satisfactory model {} at path {}, please check your start command & env. ".
+            format(str("epoch_") + str(args.infer_epoch), args.model_path))
 
     for model in model_list:
         logger.info("Test model {}".format(model))
diff --git a/PaddleRec/ctr/xdeepfm/cluster_train.py b/PaddleRec/ctr/xdeepfm/cluster_train.py
deleted file mode 100644
index 77e1e1526d3bf09f201ce494523274e0a92b4cfc..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/xdeepfm/cluster_train.py
+++ /dev/null
@@ -1,198 +0,0 @@
-import argparse
-import os
-import sys
-import time
-import network_conf
-
-import paddle.fluid as fluid
-import utils
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("xdeepfm cluster train.")
-
-    parser.add_argument(
-        '--train_data_dir',
-        type=str,
-        default='data/train_data',
-        help='The path of train data (default: data/train_data)')
-    parser.add_argument(
-        '--test_data_dir',
-        type=str,
-        default='data/test_data',
-        help='The path of test data (default: models)')
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=100,
-        help="The size of mini-batch (default:100)")
-    parser.add_argument(
-        '--embedding_size',
-        type=int,
-        default=10,
-        help="The size for embedding layer (default:10)")
-    parser.add_argument(
-        '--num_epoch',
-        type=int,
-        default=10,
-        help="The number of epochs to train (default: 10)")
-    parser.add_argument(
-        '--model_output_dir',
-        type=str,
-        required=True,
-        help='The path for model to store (default: models)')
-    parser.add_argument(
-        '--num_thread',
-        type=int,
-        default=1,
-        help='The number of threads (default: 1)')
-    parser.add_argument('--test_epoch', type=str, default='1')
-    parser.add_argument(
-        '--layer_sizes_dnn',
-        nargs='+',
-        type=int,
-        default=[10, 10, 10],
-        help='The size of each layers')
-    parser.add_argument(
-        '--layer_sizes_cin',
-        nargs='+',
-        type=int,
-        default=[10, 10],
-        help='The size of each layers')
-    parser.add_argument(
-        '--act',
-        type=str,
-        default='relu',
-        help='The activation of each layers (default: relu)')
-    parser.add_argument(
-        '--lr', type=float, default=1e-1, help='Learning rate (default: 1e-4)')
-    parser.add_argument(
-        '--reg', type=float, default=1e-4, help=' (default: 1e-4)')
-    parser.add_argument('--num_field', type=int, default=39)
-    parser.add_argument('--num_feat', type=int, default=28651)
-    parser.add_argument(
-        '--model_name',
-        type=str,
-        default='ctr_xdeepfm_model',
-        help='The name of model (default: ctr_xdeepfm_model)')
-    parser.add_argument('--use_gpu', type=int, default=1)
-    parser.add_argument('--print_steps', type=int, default=50)
-    parser.add_argument('--is_local', type=int, default=1, help='whether local')
-    parser.add_argument(
-        '--is_sparse',
-        action='store_true',
-        required=False,
-        default=False,
-        help='embedding will use sparse or not, (default: False)')
-
-    # dist params
-    parser.add_argument(
-        '--num_devices', type=int, default=1, help='Number of GPU devices')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    args = parser.parse_args()
-    return args
-
-
-def train():
-    """ do training """
-    args = parse_args()
-    print(args)
-
-    if not os.path.isdir(args.model_output_dir):
-        os.mkdir(args.model_output_dir)
-
-    loss, auc, data_list, auc_states = eval('network_conf.' + args.model_name)(
-        args.embedding_size, args.num_field, args.num_feat,
-        args.layer_sizes_dnn, args.act, args.reg, args.layer_sizes_cin,
-        args.is_sparse)
-    optimizer = fluid.optimizer.SGD(
-        learning_rate=args.lr,
-        regularization=fluid.regularizer.L2DecayRegularizer(args.reg))
-    optimizer.minimize(loss)
-
-    def train_loop(main_program):
-        """ train network """
-        start_time = time.time()
-        dataset = fluid.DatasetFactory().create_dataset()
-        dataset.set_use_var(data_list)
-        dataset.set_pipe_command('python criteo_reader.py')
-        dataset.set_batch_size(args.batch_size)
-        dataset.set_filelist([
-            os.path.join(args.train_data_dir, x)
-            for x in os.listdir(args.train_data_dir)
-        ])
-
-        if args.use_gpu == 1:
-            exe = fluid.Executor(fluid.CUDAPlace(0))
-            dataset.set_thread(1)
-        else:
-            exe = fluid.Executor(fluid.CPUPlace())
-            dataset.set_thread(args.num_thread)
-        exe.run(fluid.default_startup_program())
-
-        for epoch_id in range(args.num_epoch):
-            start = time.time()
-            sys.stderr.write('\nepoch%d start ...\n' % (epoch_id + 1))
-            exe.train_from_dataset(
-                program=main_program,
-                dataset=dataset,
-                fetch_list=[loss, auc],
-                fetch_info=['loss', 'auc'],
-                debug=False,
-                print_period=args.print_steps)
-            model_dir = os.path.join(args.model_output_dir,
-                                     'epoch_' + str(epoch_id + 1), "checkpoint")
-            sys.stderr.write('epoch%d is finished and takes %f s\n' % (
-                (epoch_id + 1), time.time() - start))
-            if args.trainer_id == 0:  # only trainer 0 save model
-                print("save model in {}".format(model_dir))
-                fluid.save(main_program, model_dir)
-
-        print("train time cost {:.4f}".format(time.time() - start_time))
-        print("finish training")
-
-    if args.is_local:
-        print("run local training")
-        train_loop(fluid.default_main_program())
-    else:
-        print("run distribute training")
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-        if args.role == "pserver":
-            print("run psever")
-            pserver_prog, pserver_startup = t.get_pserver_programs(
-                args.current_endpoint)
-
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif args.role == "trainer":
-            print("run trainer")
-            train_loop(t.get_trainer_program())
-
-
-if __name__ == "__main__":
-    utils.check_version()
-    train()
diff --git a/PaddleRec/ctr/xdeepfm/cluster_train.sh b/PaddleRec/ctr/xdeepfm/cluster_train.sh
deleted file mode 100755
index c818a01e44fac09aa18d6e25241895666f73e09e..0000000000000000000000000000000000000000
--- a/PaddleRec/ctr/xdeepfm/cluster_train.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-# start pserver0
-python -u cluster_train.py \
-    --train_data_dir data/train_data \
-    --model_output_dir cluster_model \
-    --is_local 0 \
-    --is_sparse \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-# start pserver1
-python -u cluster_train.py \
-    --train_data_dir data/train_data \
-    --model_output_dir cluster_model \
-    --is_local 0 \
-    --is_sparse \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
-python -u cluster_train.py \
-    --train_data_dir data/train_data \
-    --model_output_dir cluster_model \
-    --use_gpu 0 \
-    --is_local 0 \
-    --is_sparse \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-
-# start trainer1
-#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
-python -u cluster_train.py \
-    --train_data_dir data/train_data \
-    --model_output_dir cluster_model \
-    --use_gpu 0 \
-    --is_local 0 \
-    --is_sparse \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
-
-echo "2 pservers and 2 trainers started."
\ No newline at end of file
diff --git a/PaddleRec/ctr/xdeepfm/infer.py b/PaddleRec/ctr/xdeepfm/infer.py
index 2b0ac13191ae60b96d12de902dbffa5287723387..cd809fb81fb00c88566c4a8a450ebe0f931e4cb5 100644
--- a/PaddleRec/ctr/xdeepfm/infer.py
+++ b/PaddleRec/ctr/xdeepfm/infer.py
@@ -30,7 +30,7 @@ def infer():
         for x in os.listdir(args.test_data_dir)
     ]
     criteo_dataset = CriteoDataset()
-    test_reader = paddle.batch(
+    test_reader = fluid.io.batch(
         criteo_dataset.test(test_files), batch_size=args.batch_size)
 
     startup_program = fluid.framework.Program()
diff --git a/PaddleRec/gnn/infer.py b/PaddleRec/gnn/infer.py
index f8d1f1116283a5d21d6081155f062b466bda908c..787ee6f584a5664898e4deff79708dbaecc52610 100644
--- a/PaddleRec/gnn/infer.py
+++ b/PaddleRec/gnn/infer.py
@@ -62,10 +62,9 @@ def infer(args):
     for epoch_num in range(args.start_index, args.last_index + 1):
         model_path = os.path.join(args.model_path,  "epoch_" + str(epoch_num))
         try:
-            if not os.path.exists(model_path):
+            if not os.path.exists(model_path + ".pdmodel"):
                 raise ValueError()
-            fluid.io.load_persistables(executor=exe, dirname=model_path,
-                    main_program=infer_program)
+            fluid.io.load(infer_program, model_path+".pdmodel", exe)
 
             loss_sum = 0.0
             acc_sum = 0.0
diff --git a/PaddleRec/gnn/train.py b/PaddleRec/gnn/train.py
index a4a1898ed1c019bccc0ed076d349b76983f0d920..f96b66de3a1a258388aec6b210085c504e576051 100644
--- a/PaddleRec/gnn/train.py
+++ b/PaddleRec/gnn/train.py
@@ -140,7 +140,7 @@ def train():
         logger.info("epoch loss: %.4lf" % (np.mean(epoch_sum)))
         save_dir = os.path.join(args.model_path, "epoch_" + str(i))
         fetch_vars = [loss, acc]
-        fluid.io.save_inference_model(save_dir, feed_list, fetch_vars, exe)
+        fluid.save(fluid.default_main_program(), model_path=save_dir)
         logger.info("model saved in " + save_dir)
 
     # only for ce
diff --git a/PaddleRec/gru4rec/README.md b/PaddleRec/gru4rec/README.md
index b070cd766bec40128b8c3157fe9a5de918324018..40ffb90cb00b1294ddee62106082ad0ca402ff0e 100644
--- a/PaddleRec/gru4rec/README.md
+++ b/PaddleRec/gru4rec/README.md
@@ -11,8 +11,6 @@
 ├── infer_sample_neg.py  # 预测脚本 sample负例
 ├── net.py               # 网络结构
 ├── text2paddle.py       # 文本数据转paddle数据
-├── cluster_train.py     # 多机训练
-├── cluster_train.sh     # 多机训练脚本
 ├── utils                # 通用函数
 ├── convert_format.py    # 转换数据格式
 ├── vocab.txt            # 小样本字典
@@ -168,7 +166,7 @@ CUDA_VISIBLE_DEVICES=0 python train_sample_neg.py --loss ce --use_cuda 1
 
 可在[net.py](./net.py) `network` 函数中调整网络结构，当前的网络结构如下：
 ```python
-emb = fluid.layers.embedding(
+emb = fluid.embedding(
     input=src,
     size=[vocab_size, hid_size],
     param_attr=fluid.ParamAttr(
@@ -278,12 +276,3 @@ model:model_r@20/epoch_10 recall@20:0.681 time_cost(s):12.2
 
 ## 多机训练
 厂内用户可以参考[wiki](http://wiki.baidu.com/pages/viewpage.action?pageId=628300529)利用paddlecloud 配置多机环境
-
-可参考cluster_train.py 配置其他多机环境
-
-运行命令本地模拟多机场景, 暂不支持windows
-```
-sh cluster_train.sh
-```
-
-注意本地模拟需要关闭代理
diff --git a/PaddleRec/gru4rec/cluster_train.py b/PaddleRec/gru4rec/cluster_train.py
deleted file mode 100644
index f50542bf011d0caacddb3831368493df106463f5..0000000000000000000000000000000000000000
--- a/PaddleRec/gru4rec/cluster_train.py
+++ /dev/null
@@ -1,164 +0,0 @@
-import os
-import sys
-import time
-import six
-import numpy as np
-import math
-import argparse
-import paddle.fluid as fluid
-import paddle
-import time
-import utils
-import net
-
-SEED = 102
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("gru4rec benchmark.")
-    parser.add_argument(
-        '--train_dir',
-        type=str,
-        default='train_data',
-        help='train file address')
-    parser.add_argument(
-        '--vocab_path',
-        type=str,
-        default='vocab.txt',
-        help='vocab file address')
-    parser.add_argument('--is_local', type=int, default=1, help='whether local')
-    parser.add_argument('--hid_size', type=int, default=100, help='hid size')
-    parser.add_argument(
-        '--model_dir', type=str, default='model_recall20', help='model dir')
-    parser.add_argument(
-        '--batch_size', type=int, default=5, help='num of batch size')
-    parser.add_argument('--pass_num', type=int, default=10, help='num of epoch')
-    parser.add_argument(
-        '--print_batch', type=int, default=10, help='num of print batch')
-    parser.add_argument(
-        '--use_cuda', type=int, default=0, help='whether use gpu')
-    parser.add_argument(
-        '--base_lr', type=float, default=0.01, help='learning rate')
-    parser.add_argument(
-        '--num_devices', type=int, default=1, help='Number of GPU devices')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    args = parser.parse_args()
-    return args
-
-
-def get_cards(args):
-    return args.num_devices
-
-
-def train():
-    """ do training """
-    args = parse_args()
-    hid_size = args.hid_size
-    train_dir = args.train_dir
-    vocab_path = args.vocab_path
-    use_cuda = True if args.use_cuda else False
-    print("use_cuda:", use_cuda)
-    batch_size = args.batch_size
-    vocab_size, train_reader = utils.prepare_data(
-        train_dir, vocab_path, batch_size=batch_size * get_cards(args),\
-        buffer_size=1000, word_freq_threshold=0, is_train=True)
-
-    # Train program
-    src_wordseq, dst_wordseq, avg_cost, acc = net.all_vocab_network(
-        vocab_size=vocab_size, hid_size=hid_size)
-
-    # Optimization to minimize lost
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr)
-    sgd_optimizer.minimize(avg_cost)
-
-    def train_loop(main_program):
-        """ train network """
-        pass_num = args.pass_num
-        model_dir = args.model_dir
-        fetch_list = [avg_cost.name]
-
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        exe.run(fluid.default_startup_program())
-        total_time = 0.0
-        for pass_idx in six.moves.xrange(pass_num):
-            epoch_idx = pass_idx + 1
-            print("epoch_%d start" % epoch_idx)
-
-            t0 = time.time()
-            i = 0
-            newest_ppl = 0
-            for data in train_reader():
-                i += 1
-                lod_src_wordseq = utils.to_lodtensor([dat[0] for dat in data],
-                                                     place)
-                lod_dst_wordseq = utils.to_lodtensor([dat[1] for dat in data],
-                                                     place)
-                ret_avg_cost = exe.run(main_program,
-                                       feed={
-                                           "src_wordseq": lod_src_wordseq,
-                                           "dst_wordseq": lod_dst_wordseq
-                                       },
-                                       fetch_list=fetch_list)
-                avg_ppl = np.exp(ret_avg_cost[0])
-                newest_ppl = np.mean(avg_ppl)
-                if i % args.print_batch == 0:
-                    print("step:%d ppl:%.3f" % (i, newest_ppl))
-
-            t1 = time.time()
-            total_time += t1 - t0
-            print("epoch:%d num_steps:%d time_cost(s):%f" %
-                  (epoch_idx, i, total_time / epoch_idx))
-            save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
-            feed_var_names = ["src_wordseq", "dst_wordseq"]
-            fetch_vars = [avg_cost, acc]
-            if args.trainer_id == 0:
-                fluid.io.save_inference_model(save_dir, feed_var_names,
-                                              fetch_vars, exe)
-                print("model saved in %s" % save_dir)
-        print("finish training")
-
-    if args.is_local:
-        print("run local training")
-        train_loop(fluid.default_main_program())
-    else:
-        print("run distribute training")
-        t = fluid.DistributeTranspiler()
-        t.transpile(
-            args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-        if args.role == "pserver":
-            print("run psever")
-            pserver_prog = t.get_pserver_program(args.current_endpoint)
-            pserver_startup = t.get_startup_program(args.current_endpoint,
-                                                    pserver_prog)
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif args.role == "trainer":
-            print("run trainer")
-            train_loop(t.get_trainer_program())
-
-
-if __name__ == "__main__":
-    train()
diff --git a/PaddleRec/gru4rec/cluster_train.sh b/PaddleRec/gru4rec/cluster_train.sh
deleted file mode 100644
index 2711ffaddcb89ea7e53ba676261d9dfe392eda33..0000000000000000000000000000000000000000
--- a/PaddleRec/gru4rec/cluster_train.sh
+++ /dev/null
@@ -1,62 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-# start pserver0
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --is_local 0 \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-# start pserver1
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --is_local 0 \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --print_batch 10 \
-    --use_cuda 0 \
-    --is_local 0 \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-
-# start trainer1
-#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --print_batch 10 \
-    --use_cuda 0 \
-    --is_local 0 \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
diff --git a/PaddleRec/gru4rec/utils.py b/PaddleRec/gru4rec/utils.py
index 424ebf78490a7b5220ec1a3c2a1b045f28638f35..d853770b76342f8b3d87fe7b3f9d54cf666675a4 100644
--- a/PaddleRec/gru4rec/utils.py
+++ b/PaddleRec/gru4rec/utils.py
@@ -103,7 +103,7 @@ def prepare_data(file_dir,
     if is_train and 'ce_mode' not in os.environ:
         vocab_size = get_vocab_size(vocab_path)
         reader = sort_batch(
-            paddle.reader.shuffle(
+            fluid.io.shuffle(
                 train(
                     file_dir, buffer_size, data_type=DataType.SEQ),
                 buf_size=buffer_size),
diff --git a/PaddleRec/multiview_simnet/infer.py b/PaddleRec/multiview_simnet/infer.py
index e9136588add2815c65e5b9e7e707de1f6fce8707..89ceb8cba1ade8f6a84aa6555f3bbeeaf3ff37e1 100644
--- a/PaddleRec/multiview_simnet/infer.py
+++ b/PaddleRec/multiview_simnet/infer.py
@@ -102,8 +102,8 @@ def parse_args():
 def start_infer(args, model_path):
     dataset = reader.SyntheticDataset(args.sparse_feature_dim, args.query_slots,
                                       args.title_slots)
-    test_reader = paddle.batch(
-        paddle.reader.shuffle(
+    test_reader = fluid.io.batch(
+        fluid.io.shuffle(
             dataset.valid(), buf_size=args.batch_size * 100),
         batch_size=args.batch_size)
     place = fluid.CPUPlace()
diff --git a/PaddleRec/multiview_simnet/nets.py b/PaddleRec/multiview_simnet/nets.py
index 104101e2b8f6548f4ddce289b97794d569eefc41..6a22820317a8f4073d7f79ffa644cbd0f07150fd 100644
--- a/PaddleRec/multiview_simnet/nets.py
+++ b/PaddleRec/multiview_simnet/nets.py
@@ -190,9 +190,8 @@ class MultiviewSimnet(object):
 
         # pairwise hinge_loss
         loss_part1 = fluid.layers.elementwise_sub(
-            tensor.fill_constant_batch_size_like(
-                input=cos_pos,
-                shape=[-1, 1],
+            fluid.layers.fill_constant(
+                shape=[fluid.layers.shape(cos_pos)[0], 1],
                 value=self.margin,
                 dtype='float32'),
             cos_pos)
@@ -200,8 +199,10 @@ class MultiviewSimnet(object):
         loss_part2 = fluid.layers.elementwise_add(loss_part1, cos_neg)
 
         loss_part3 = fluid.layers.elementwise_max(
-            tensor.fill_constant_batch_size_like(
-                input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
+            fluid.layers.fill_constant(
+                shape=[fluid.layers.shape(loss_part2)[0], 1],
+                value=0.0,
+                dtype='float32'),
             loss_part2)
 
         avg_cost = fluid.layers.mean(loss_part3)
diff --git a/PaddleRec/multiview_simnet/train.py b/PaddleRec/multiview_simnet/train.py
index fd8de5068b052a2dc12766395f804460d0936a41..95c9b24f4ed36e4696b44f55d6a56cb714183f5a 100644
--- a/PaddleRec/multiview_simnet/train.py
+++ b/PaddleRec/multiview_simnet/train.py
@@ -112,8 +112,8 @@ def start_train(args):
 
     dataset = reader.SyntheticDataset(args.sparse_feature_dim, args.query_slots,
                                       args.title_slots)
-    train_reader = paddle.batch(
-        paddle.reader.shuffle(
+    train_reader = fluid.io.batch(
+        fluid.io.shuffle(
             dataset.train(), buf_size=args.batch_size * 100),
         batch_size=args.batch_size)
     place = fluid.CPUPlace()
diff --git a/PaddleRec/ncf/evaluate.py b/PaddleRec/ncf/evaluate.py
index a8becd1b5019a057ea0cecabbec19138320cbc20..1a655e4548cd317734bc9228eb2405da6dca4a14 100644
--- a/PaddleRec/ncf/evaluate.py
+++ b/PaddleRec/ncf/evaluate.py
@@ -1,5 +1,5 @@
 import math
-import heapq # for retrieval topK
+import heapq  # for retrieval topK
 import multiprocessing
 import numpy as np
 from time import time
@@ -23,30 +23,36 @@ _K = None
 _args = None
 _model_path = None
 
+
 def run_infer(args, model_path, test_data_path):
     test_data_generator = utils.CriteoDataset()
-    
+
     with fluid.scope_guard(fluid.Scope()):
-        test_reader = paddle.batch(test_data_generator.test(test_data_path, False), batch_size=args.test_batch_size)
-            
+        test_reader = fluid.io.batch(
+            test_data_generator.test(test_data_path, False),
+            batch_size=args.test_batch_size)
+
         place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(fluid.default_startup_program())
 
-        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(model_path, exe)
+        infer_program, feed_target_names, fetch_vars = fluid.io.load_inference_model(
+            model_path, exe)
 
         for data in test_reader():
             user_input = np.array([dat[0] for dat in data])
             item_input = np.array([dat[1] for dat in data])
 
-            pred_val = exe.run(infer_program,
-                       feed={"user_input": user_input,
-                            "item_input": item_input},
-                       fetch_list=fetch_vars,
-                        return_numpy=True)
-        
+            pred_val = exe.run(
+                infer_program,
+                feed={"user_input": user_input,
+                      "item_input": item_input},
+                fetch_list=fetch_vars,
+                return_numpy=True)
+
             return pred_val[0].reshape(1, -1).tolist()[0]
 
+
 def evaluate_model(args, testRatings, testNegatives, K, model_path):
     """
     Evaluate the performance (Hit_Ratio, NDCG) of top-K recommendation
@@ -56,22 +62,23 @@ def evaluate_model(args, testRatings, testNegatives, K, model_path):
     global _testRatings
     global _testNegatives
     global _K
-    global _model_path 
+    global _model_path
     global _args
-    
+
     _args = args
-    _model_path= model_path
+    _model_path = model_path
     _testRatings = testRatings
     _testNegatives = testNegatives
     _K = K
-        
-    hits, ndcgs = [],[]
+
+    hits, ndcgs = [], []
     for idx in range(len(_testRatings)):
-        (hr,ndcg) = eval_one_rating(idx)
+        (hr, ndcg) = eval_one_rating(idx)
         hits.append(hr)
-        ndcgs.append(ndcg)      
+        ndcgs.append(ndcg)
     return (hits, ndcgs)
 
+
 def eval_one_rating(idx):
     rating = _testRatings[idx]
     items = _testNegatives[idx]
@@ -80,9 +87,9 @@ def eval_one_rating(idx):
     items.append(gtItem)
     # Get prediction scores
     map_item_score = {}
-    users = np.full(len(items), u, dtype = 'int32')
-    users = users.reshape(-1,1)
-    items_array = np.array(items).reshape(-1,1)
+    users = np.full(len(items), u, dtype='int32')
+    users = users.reshape(-1, 1)
+    items_array = np.array(items).reshape(-1, 1)
     temp = np.hstack((users, items_array))
     np.savetxt("Data/test.txt", temp, fmt='%d', delimiter=',')
     predictions = run_infer(_args, _model_path, _args.test_data_path)
@@ -91,7 +98,7 @@ def eval_one_rating(idx):
         item = items[i]
         map_item_score[item] = predictions[i]
     items.pop()
-    
+
     # Evaluate top rank list
     ranklist = heapq.nlargest(_K, map_item_score, key=map_item_score.get)
     hr = getHitRatio(ranklist, gtItem)
@@ -99,15 +106,17 @@ def eval_one_rating(idx):
 
     return (hr, ndcg)
 
+
 def getHitRatio(ranklist, gtItem):
     for item in ranklist:
         if item == gtItem:
             return 1
     return 0
 
+
 def getNDCG(ranklist, gtItem):
     for i in range(len(ranklist)):
         item = ranklist[i]
         if item == gtItem:
-            return math.log(2) / math.log(i+2)
+            return math.log(2) / math.log(i + 2)
     return 0
diff --git a/PaddleRec/ssr/README.md b/PaddleRec/ssr/README.md
index 6abc52405a3bf6a288bae2b3675d84fe33bd00ac..57b3503bc54d610d2e03894ba418faf404b18afe 100644
--- a/PaddleRec/ssr/README.md
+++ b/PaddleRec/ssr/README.md
@@ -43,9 +43,5 @@ cpu 单机多卡训练
 CPU_NUM=10 python train.py --train_dir train_data --use_cuda 0 --parallel 1 --batch_size 50 --model_dir model_output --num_devices 10
 ```
 
-本地模拟多机训练, 不支持windows.
-``` bash
-sh cluster_train.sh
-```
 
 ## Inference
diff --git a/PaddleRec/ssr/cluster_train.py b/PaddleRec/ssr/cluster_train.py
deleted file mode 100644
index 7cbe7ab429993b9a1523ddff2cf9fa435710c9fb..0000000000000000000000000000000000000000
--- a/PaddleRec/ssr/cluster_train.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import sys
-import time
-import argparse
-import logging
-import paddle.fluid as fluid
-import paddle
-import utils
-import numpy as np
-from nets import SequenceSemanticRetrieval
-
-logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s")
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser("sequence semantic retrieval")
-    parser.add_argument(
-        "--train_dir", type=str, default='train_data', help="Training file")
-    parser.add_argument(
-        "--base_lr", type=float, default=0.01, help="learning rate")
-    parser.add_argument(
-        '--vocab_path', type=str, default='vocab.txt', help='vocab file')
-    parser.add_argument(
-        "--epochs", type=int, default=10, help="Number of epochs")
-    parser.add_argument(
-        '--parallel', type=int, default=0, help='whether parallel')
-    parser.add_argument(
-        '--use_cuda', type=int, default=0, help='whether use gpu')
-    parser.add_argument(
-        '--print_batch', type=int, default=10, help='num of print batch')
-    parser.add_argument(
-        '--model_dir', type=str, default='model_output', help='model dir')
-    parser.add_argument(
-        "--hidden_size", type=int, default=128, help="hidden size")
-    parser.add_argument(
-        "--batch_size", type=int, default=50, help="number of batch")
-    parser.add_argument(
-        "--embedding_dim", type=int, default=128, help="embedding dim")
-    parser.add_argument(
-        '--num_devices', type=int, default=1, help='Number of GPU devices')
-    parser.add_argument(
-        '--step_num', type=int, default=1000, help='Number of steps')
-    parser.add_argument(
-        '--enable_ce',
-        action='store_true',
-        help='If set, run the task with continuous evaluation logs.')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    return parser.parse_args()
-
-
-def get_cards(args):
-    return args.num_devices
-
-
-def train_loop(main_program, avg_cost, acc, train_input_data, place, args,
-               train_reader):
-    data_list = [var.name for var in train_input_data]
-    feeder = fluid.DataFeeder(feed_list=data_list, place=place)
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-    train_exe = exe
-
-    total_time = 0.0
-    ce_info = []
-    for pass_id in range(args.epochs):
-        epoch_idx = pass_id + 1
-        print("epoch_%d start" % epoch_idx)
-        t0 = time.time()
-        i = 0
-        for batch_id, data in enumerate(train_reader()):
-            i += 1
-            loss_val, correct_val = train_exe.run(
-                feed=feeder.feed(data), fetch_list=[avg_cost.name, acc.name])
-            ce_info.append(float(np.mean(correct_val)) / args.batch_size)
-            if i % args.print_batch == 0:
-                logger.info(
-                    "Train --> pass: {} batch_id: {} avg_cost: {}, acc: {}".
-                    format(pass_id, batch_id,
-                           np.mean(loss_val),
-                           float(np.mean(correct_val)) / args.batch_size))
-            if args.enable_ce and i > args.step_num:
-                break
-        t1 = time.time()
-        total_time += t1 - t0
-        print("epoch:%d num_steps:%d time_cost(s):%f" %
-              (epoch_idx, i, total_time / epoch_idx))
-        save_dir = "%s/epoch_%d" % (args.model_dir, epoch_idx)
-        fluid.save(fluid.default_main_program(), save_dir)
-        print("model saved in %s" % save_dir)
-
-    # only for ce
-    if args.enable_ce:
-        ce_acc = 0
-        try:
-            ce_acc = ce_info[-2]
-        except:
-            print("ce info error")
-        epoch_idx = args.epochs
-        device = get_device(args)
-        if args.use_cuda:
-            gpu_num = device[1]
-            print("kpis\teach_pass_duration_gpu%s\t%s" %
-                  (gpu_num, total_time / epoch_idx))
-            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
-        else:
-            cpu_num = device[1]
-            threads_num = device[2]
-            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
-                  (cpu_num, threads_num, total_time / epoch_idx))
-            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
-                  (cpu_num, threads_num, ce_acc))
-
-
-def train(args):
-    if args.enable_ce:
-        SEED = 102
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
-    use_cuda = True if args.use_cuda else False
-    parallel = True if args.parallel else False
-    print("use_cuda:", use_cuda, "parallel:", parallel)
-    train_reader, vocab_size = utils.construct_train_data(
-        args.train_dir, args.vocab_path, args.batch_size * get_cards(args))
-    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    ssr = SequenceSemanticRetrieval(vocab_size, args.embedding_dim,
-                                    args.hidden_size)
-    # Train program
-    train_input_data, cos_pos, avg_cost, acc = ssr.train()
-
-    # Optimization to minimize lost
-    optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
-    optimizer.minimize(avg_cost)
-
-    print("run distribute training")
-    t = fluid.DistributeTranspiler()
-    t.transpile(
-        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-    if args.role == "pserver":
-        print("run psever")
-        pserver_prog = t.get_pserver_program(args.current_endpoint)
-        pserver_startup = t.get_startup_program(args.current_endpoint,
-                                                pserver_prog)
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(pserver_startup)
-        exe.run(pserver_prog)
-    elif args.role == "trainer":
-        print("run trainer")
-        train_loop(t.get_trainer_program(), avg_cost, acc, train_input_data,
-                   place, args, train_reader)
-
-
-def get_device(args):
-    if args.use_cuda:
-        gpus = os.environ.get("CUDA_VISIBLE_DEVICES", 1)
-        gpu_num = len(gpus.split(','))
-        return "gpu", gpu_num
-    else:
-        threads_num = os.environ.get('NUM_THREADS', 1)
-        cpu_num = os.environ.get('CPU_NUM', 1)
-        return "cpu", int(cpu_num), int(threads_num)
-
-
-def main():
-    args = parse_args()
-    train(args)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/PaddleRec/ssr/cluster_train.sh b/PaddleRec/ssr/cluster_train.sh
deleted file mode 100644
index aeb1d9c5cb102a511b0bc3485e6906f9d7985628..0000000000000000000000000000000000000000
--- a/PaddleRec/ssr/cluster_train.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-# start pserver0
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-# start pserver1
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --print_batch 10 \
-    --use_cuda 0 \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-
-# start trainer1
-#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --vocab_path vocab.txt \
-    --batch_size 5 \
-    --print_batch 10 \
-    --use_cuda 0 \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
diff --git a/PaddleRec/ssr/infer.py b/PaddleRec/ssr/infer.py
index c1bc8a7143768311cff03ae9ddf813bb495ecdd5..42a26f14f6cbf8f52479ee7c414a1f03d58170e0 100644
--- a/PaddleRec/ssr/infer.py
+++ b/PaddleRec/ssr/infer.py
@@ -37,14 +37,14 @@ def parse_args():
 
 
 def model(vocab_size, emb_size, hidden_size):
-    user_data = fluid.layers.data(
-        name="user", shape=[1], dtype="int64", lod_level=1)
-    all_item_data = fluid.layers.data(
-        name="all_item", shape=[vocab_size, 1], dtype="int64")
+    user_data = fluid.data(
+        name="user", shape=[None, 1], dtype="int64", lod_level=1)
+    all_item_data = fluid.data(
+        name="all_item", shape=[None, vocab_size], dtype="int64")
 
-    user_emb = fluid.layers.embedding(
+    user_emb = fluid.embedding(
         input=user_data, size=[vocab_size, emb_size], param_attr="emb.item")
-    all_item_emb = fluid.layers.embedding(
+    all_item_emb = fluid.embedding(
         input=all_item_data, size=[vocab_size, emb_size], param_attr="emb.item")
     all_item_emb_re = fluid.layers.reshape(x=all_item_emb, shape=[-1, emb_size])
 
@@ -63,7 +63,7 @@ def model(vocab_size, emb_size, hidden_size):
                                    bias_attr="item.b")
     cos_item = fluid.layers.cos_sim(X=all_item_hid, Y=user_re)
     all_pre_ = fluid.layers.reshape(x=cos_item, shape=[-1, vocab_size])
-    pos_label = fluid.layers.data(name="pos_label", shape=[1], dtype="int64")
+    pos_label = fluid.data(name="pos_label", shape=[None, 1], dtype="int64")
     acc = fluid.layers.accuracy(input=all_pre_, label=pos_label, k=20)
     return acc
 
@@ -94,7 +94,7 @@ def infer(args, vocab_size, test_reader):
                     user_data, pos_label = utils.infer_data(data, place)
                     all_item_numpy = np.tile(
                         np.arange(vocab_size), len(pos_label)).reshape(
-                            len(pos_label), vocab_size, 1).astype("int64")
+                            len(pos_label), vocab_size).astype("int64")
                     para = exe.run(copy_program,
                                    feed={
                                        "user": user_data,
diff --git a/PaddleRec/ssr/nets.py b/PaddleRec/ssr/nets.py
index 7b78adae3b45626f10f99b57654501b5f09f19a1..3026562d820effcbc4bf416a8e56d5e8b75c5924 100644
--- a/PaddleRec/ssr/nets.py
+++ b/PaddleRec/ssr/nets.py
@@ -57,13 +57,17 @@ class PairwiseHingeLoss(object):
 
     def forward(self, pos, neg):
         loss_part1 = fluid.layers.elementwise_sub(
-            tensor.fill_constant_batch_size_like(
-                input=pos, shape=[-1, 1], value=self.margin, dtype='float32'),
+            fluid.layers.fill_constant(
+                shape=[fluid.layers.shape(pos)[0], 1],
+                value=self.margin,
+                dtype='float32'),
             pos)
         loss_part2 = fluid.layers.elementwise_add(loss_part1, neg)
         loss_part3 = fluid.layers.elementwise_max(
-            tensor.fill_constant_batch_size_like(
-                input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
+            fluid.layers.fill_constant(
+                shape=[fluid.layers.shape(loss_part2)[0], 1],
+                value=0.0,
+                dtype='float32'),
             loss_part2)
         return loss_part3
 
diff --git a/PaddleRec/ssr/utils.py b/PaddleRec/ssr/utils.py
index 65571cb08a930d520c82c881bf5c68ca7c53b152..a5b6decf6dac7ff2af713d17262430ca8da9af1f 100644
--- a/PaddleRec/ssr/utils.py
+++ b/PaddleRec/ssr/utils.py
@@ -18,7 +18,7 @@ def construct_train_data(file_dir, vocab_path, batch_size):
     files = [file_dir + '/' + f for f in os.listdir(file_dir)]
     y_data = reader.YoochooseDataset(vocab_size)
     train_reader = fluid.io.batch(
-        paddle.reader.shuffle(
+        fluid.io.shuffle(
             y_data.train(files), buf_size=batch_size * 100),
         batch_size=batch_size)
     return train_reader, vocab_size
diff --git a/PaddleRec/tagspace/README.md b/PaddleRec/tagspace/README.md
index 1980b06dfd531da3db5691a204a3a0c088785e2d..2261e64f843431eb7eebd7eb4905e3d59396a7d2 100644
--- a/PaddleRec/tagspace/README.md
+++ b/PaddleRec/tagspace/README.md
@@ -9,8 +9,6 @@
 ├── infer.py             # 预测脚本
 ├── net.py               # 网络结构
 ├── text2paddle.py       # 文本数据转paddle数据
-├── cluster_train.py     # 多机训练
-├── cluster_train.sh     # 多机训练脚本
 ├── utils                # 通用函数
 ├── vocab_text.txt       # 小样本文本字典
 ├── vocab_tag.txt        # 小样本类别字典
@@ -89,9 +87,3 @@ python infer.py
 ```
 python infer.py --model_dir big_model --vocab_tag_path big_vocab_tag.txt --test_dir test_big_data/
 ```
-
-## 本地模拟多机
-运行命令
-```
-sh cluster_train.py
-```
diff --git a/PaddleRec/tagspace/cluster_train.py b/PaddleRec/tagspace/cluster_train.py
deleted file mode 100644
index 96cdf615ed9e6673159db5163f33f431d5c6e9bc..0000000000000000000000000000000000000000
--- a/PaddleRec/tagspace/cluster_train.py
+++ /dev/null
@@ -1,137 +0,0 @@
-import os
-import sys
-import time
-import six
-import numpy as np
-import math
-import argparse
-import paddle
-import paddle.fluid as fluid
-import time
-import utils
-import net
-
-SEED = 102
-
-def parse_args():
-    parser = argparse.ArgumentParser("TagSpace benchmark.")
-    parser.add_argument(
-        '--neg_size', type=int, default=3, help='neg/pos ratio')
-    parser.add_argument(
-        '--train_dir', type=str, default='train_data', help='train file address')
-    parser.add_argument(
-        '--vocab_text_path', type=str, default='vocab_text.txt', help='vocab_text file address')
-    parser.add_argument(
-        '--vocab_tag_path', type=str, default='vocab_tag.txt', help='vocab_text file address')
-    parser.add_argument(
-        '--is_local', type=int, default=1, help='whether local')
-    parser.add_argument(
-        '--model_dir', type=str, default='model_', help='model dir')
-    parser.add_argument(
-        '--batch_size', type=int, default=5, help='num of batch size')
-    parser.add_argument(
-        '--print_batch', type=int, default=10, help='num of print batch')
-    parser.add_argument(
-        '--pass_num', type=int, default=10, help='num of epoch')
-    parser.add_argument(
-        '--use_cuda', type=int, default=0, help='whether use gpu')
-    parser.add_argument(
-        '--base_lr', type=float, default=0.01, help='learning rate')
-    parser.add_argument(
-        '--num_devices', type=int, default=1, help='Number of GPU devices')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints', type=str, default='127.0.0.1:6000', help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint', type=str, default='127.0.0.1:6000', help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id', type=int, default=0, help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers', type=int, default=1, help='The num of trianers, (default: 1)')
-    args = parser.parse_args()
-    return args
-
-def get_cards(args):
-    return args.num_devices
-
-def train():
-    """ do training """
-    args = parse_args()
-    train_dir = args.train_dir
-    vocab_text_path = args.vocab_text_path
-    vocab_tag_path = args.vocab_tag_path
-    use_cuda = True if args.use_cuda else False
-    batch_size = args.batch_size
-    neg_size = args.neg_size
-    vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
-        file_dir=train_dir, vocab_text_path=vocab_text_path, 
-        vocab_tag_path=vocab_tag_path, neg_size=neg_size, 
-        batch_size=batch_size * get_cards(args), 
-        buffer_size=batch_size*100, is_train=True)
-    """ train network """
-    # Train program
-    avg_cost, correct, cos_pos = net.network(vocab_text_size, vocab_tag_size, neg_size=neg_size)
-
-    # Optimization to minimize lost
-    sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_lr)
-    sgd_optimizer.minimize(avg_cost)
-
-    def train_loop(main_program):
-        # Initialize executor
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-        exe = fluid.Executor(place)
-        pass_num = args.pass_num
-        model_dir = args.model_dir
-        fetch_list = [avg_cost.name]
-        exe.run(fluid.default_startup_program())
-        total_time = 0.0
-        for pass_idx in range(pass_num):
-            epoch_idx = pass_idx + 1
-            print("epoch_%d start" % epoch_idx)
-            t0 = time.time()
-            for batch_id, data in enumerate(train_reader()):
-                lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
-                lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
-                lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
-                loss_val, correct_val = exe.run(
-                        feed={
-                            "text": lod_text_seq,
-                            "pos_tag": lod_pos_tag,
-                            "neg_tag": lod_neg_tag},
-                        fetch_list=[avg_cost.name, correct.name])
-                if batch_id % args.print_batch == 0:
-                    print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}"
-                            .format(pass_idx, (batch_id+10) * batch_size, np.mean(loss_val),
-                                    float(np.sum(correct_val)) / batch_size))
-            t1 = time.time()
-            total_time += t1 - t0
-            print("epoch:%d num_steps:%d time_cost(s):%f" %
-                  (epoch_idx, batch_id, total_time / epoch_idx))
-            save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
-            feed_var_names = ["text", "pos_tag"]
-            fetch_vars = [cos_pos]
-            fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars, exe)
-        print("finish training")
-    
-    if args.is_local:
-        print("run local training")
-        train_loop(fluid.default_main_program())
-    else:
-        print("run distribute training")
-        t = fluid.DistributeTranspiler()
-        t.transpile(args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-        if args.role == "pserver":
-            print("run psever")
-            pserver_prog = t.get_pserver_program(args.current_endpoint)
-            pserver_startup = t.get_startup_program(args.current_endpoint,
-                                                    pserver_prog)
-            exe = fluid.Executor(fluid.CPUPlace())
-            exe.run(pserver_startup)
-            exe.run(pserver_prog)
-        elif args.role == "trainer":
-            print("run trainer")
-            train_loop(t.get_trainer_program())
-
-if __name__ == "__main__":
-    train()
diff --git a/PaddleRec/tagspace/cluster_train.sh b/PaddleRec/tagspace/cluster_train.sh
deleted file mode 100644
index 16698e1d4acf06bd5bb06f9aceb7b3b135b5f234..0000000000000000000000000000000000000000
--- a/PaddleRec/tagspace/cluster_train.sh
+++ /dev/null
@@ -1,58 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-# start pserver0
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --batch_size 5 \
-    --is_local 0 \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-# start pserver1
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --batch_size 5 \
-    --is_local 0 \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-#CUDA_VISIBLE_DEVICES=1 python cluster_train.py \
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --batch_size 5 \
-    --print_batch 10 \
-    --use_cuda 0 \
-    --is_local 0 \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-
-# start trainer1
-#CUDA_VISIBLE_DEVICES=2 python cluster_train.py \
-python cluster_train.py \
-    --train_dir train_data \
-    --model_dir cluster_model \
-    --batch_size 5 \
-    --print_batch 10 \
-    --use_cuda 0 \
-    --is_local 0 \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
diff --git a/PaddleRec/tagspace/net.py b/PaddleRec/tagspace/net.py
index 479d6620aaf1ddf3b6ca5decf56f85e604bd0878..3a8287e723fe3316c963173cab61d92ff832054c 100644
--- a/PaddleRec/tagspace/net.py
+++ b/PaddleRec/tagspace/net.py
@@ -46,13 +46,17 @@ def network(vocab_text_size,
     cos_neg = nn.reduce_max(cos_neg_all, dim=1, keep_dim=True)
     #calculate hinge loss
     loss_part1 = nn.elementwise_sub(
-        tensor.fill_constant_batch_size_like(
-            input=cos_pos, shape=[-1, 1], value=margin, dtype='float32'),
+        fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(cos_pos)[0], 1],
+            value=margin,
+            dtype='float32'),
         cos_pos)
     loss_part2 = nn.elementwise_add(loss_part1, cos_neg)
     loss_part3 = nn.elementwise_max(
-        tensor.fill_constant_batch_size_like(
-            input=loss_part2, shape=[-1, 1], value=0.0, dtype='float32'),
+        fluid.layers.fill_constant(
+            shape=[fluid.layers.shape(loss_part2)[0], 1],
+            value=0.0,
+            dtype='float32'),
         loss_part2)
     avg_cost = nn.mean(loss_part3)
     less = tensor.cast(cf.less_than(cos_neg, cos_pos), dtype='float32')
diff --git a/PaddleRec/tagspace/utils.py b/PaddleRec/tagspace/utils.py
index 7ae71249e41ec2d6f42be07f3a5a472a1d3ba8a2..5730efeaa41c3dfaca641f700e45b2fe212e3c5e 100644
--- a/PaddleRec/tagspace/utils.py
+++ b/PaddleRec/tagspace/utils.py
@@ -65,7 +65,7 @@ def prepare_data(file_dir,
         vocab_text_size = get_vocab_size(vocab_text_path)
         vocab_tag_size = get_vocab_size(vocab_tag_path)
         reader = sort_batch(
-            paddle.reader.shuffle(
+            fluid.io.shuffle(
                 train(
                     file_dir,
                     vocab_tag_size,
diff --git a/PaddleRec/tdm/tdm_demo/README.md b/PaddleRec/tdm/tdm_demo/README.md
index 8e9b7f4cbf8223fa550a5405b78d5ca96d0799ce..42b360830dc8457b4c9d4dcd351799dd0ee289a6 100644
--- a/PaddleRec/tdm/tdm_demo/README.md
+++ b/PaddleRec/tdm/tdm_demo/README.md
@@ -51,7 +51,7 @@
 - **Node-Embedding**：注意，此处的Embedding，并非我们已有的item-embedding，而是构建完成的树的节点对应的Embedding，由item-embedding通过规则生成，是我们的网络主要训练的目标。ID范围为所有0->节点数-1。我们同时也需准备一个映射表，来告诉模型，item_id到node_id的映射关系。
 - **Travel**：是指叶子节点从root开始直到其自身的遍历路径，如上图，14号节点的Travel：0->1->3->7->14
 - **Layer**：指树的层，如上图，共有4层。
-  
+
 > Paddle-TDM在训练时，不会改动树的结构，只会改动Node-Embedding。
 
 
@@ -156,7 +156,7 @@ TDM的组网，宏观上，可以概括为三个部分
 
 **demo模型，假设输入为两个元素：**
 > 一、user/query的emb表示，该emb应该来源于特征的组合在某个空间的映射（比如若干特征取emb后concat到一起），或其他预训练模型的处理结果（比如将明文query通过nlp预处理得到emb表示）
-    
+
 > 二、item的正样本，是发生了实际点击/购买/浏览等行为的item_id，与输入的user/query emb强相关，是我们之后通过预测想得到的结果。
 
 在paddle组网中，我们这样定义上面两个变量：
@@ -233,9 +233,9 @@ tdm_sampler的运行逻辑如下：
     - 在item遍历路径上的node视为正样本，`positive_node_id`由`travel_list[item_id][i]`给出，其他同层的兄弟节点视为负样本，该层节点列表由`layer_list[i]`给出，如果`positive_node_id`不在`layer_list[i]`中，会提示错误。
 
     - 在兄弟节点中进行随机采样，采样N个node，N由`neg_sampling_list[i]`的值决定，如果该值大于兄弟节点的数量，会提示错误。 采样结果不会重复，且不会采样到正样本。
-    
+
     - 如果`output_positive=True`，则会同时输出正负样本，否则只输出负采样的结果
-    
+
     - 生成该层`label`，shape与采样结果一致，正样本对应的label=1，负样本的label=0
 
     - 生成该层`mask`，如果树是不平衡的，则有些item不会位于树的最后一层，所以遍历路径的实际长度会比其他item少，为了tensor维度一致，travel_list中padding了0。当遇到了padding的0时，tdm_sampler也会输出正常维度的采样结果，采样结果与label都为0。为了区分这部分虚拟的采样结果与真实采样结果，会给虚拟采样结果额外设置mask=0，如果是真实采样结果mask=1
@@ -403,23 +403,23 @@ acc = fluid.layers.accuracy(input=softmax_prob, label=labels_reshape)
 在demo网络中，我们设置为从某一层的所有节点开始进行检索。paddle组网对输入定义的实现如下：
 ```python
 def input_data(self):
-    input_emb = fluid.layers.data(
+    input_emb = fluid.data(
         name="input_emb",
-        shape=[self.input_embed_size],
+        shape=[None, self.input_embed_size],
         dtype="float32",
     )
 
     # first_layer 与 first_layer_mask 对应着infer起始的节点
-    first_layer = fluid.layers.data(
+    first_layer = fluid.data(
         name="first_layer_node",
-        shape=[1],
+        shape=[None, 1],
         dtype="int64",
         lod_level=1, #支持变长
     )
 
-    first_layer_mask = fluid.layers.data(
+    first_layer_mask = fluid.data(
         name="first_layer_node_mask",
-        shape=[1],
+        shape=[None, 1],
         dtype="int64",
         lod_level=1,
     )
@@ -447,7 +447,7 @@ def create_first_layer(self, args):
 tdm的检索逻辑类似beamsearch，简单来说：在每一层计算打分，得到topK的节点，将这些节点的孩子节点作为下一层的输入，如此循环，得到最终的topK。但仍然有一些需要注意的细节，下面将详细介绍。
 
 - 问题一：怎么处理`input_emb`？
-  
+
   - input_emb过`input_fc`，检索中，只需过一次即可:
   ```python
   nput_trans_emb = self.input_trans_net.input_fc_infer(input_emb)
@@ -663,7 +663,7 @@ if args.save_init_model or not args.load_model:
 ```
 
 > 为什么每次加载模型手动Set `learning rate`？
-> 
+>
 > 学习率在paddle的组网中，是以一个`persistable=Ture`的长期变量储存在模型的Variable scope里的。每次使用load_persistables加载模型时，也会使用加载的模型的学习率覆盖本地模型的默认学习率，换言之，加载init_model以后，学习率也是保存init_model时的学习率。对模型的调试会产生不必要的影响，为了保证网络训练如预期，需要这样的手动set步骤。
 
 ### demo的训练运行方法
@@ -706,4 +706,4 @@ Demo代码中给出了基于paddle预测库加载tdm模型，输入emb产出item
     ```
 - 首先需要运行`run_infer.sh`，打开`save_init_model`开关，使用`save_inference_model`产出paddle的推理模型，`predict.py`会加载`infer_model`，进行高速推理。
 - 欲想进一步高速推理，需使用含预测库的paddle预测库，可以使用`mkl`及`mkl_dnn`等计算库加速op的计算。相关文档可以参考：[服务器端部署](https://www.paddlepaddle.org.cn/documentation/docs/zh/advanced_guide/inference_deployment/inference/index_cn.html)
-- tdm相关op目前仅支持在cpu设备上运行，后续会支持GPU，欢迎关注。
\ No newline at end of file
+- tdm相关op目前仅支持在cpu设备上运行，后续会支持GPU，欢迎关注。
diff --git a/PaddleRec/tdm/tdm_demo/dataset_generator.py b/PaddleRec/tdm/tdm_demo/dataset_generator.py
index a600ed9221e37ccfb3aea1e5ed8f6f35556298f8..b186a1ae28b5e4cd34da6e2faeaf58c1572be300 100644
--- a/PaddleRec/tdm/tdm_demo/dataset_generator.py
+++ b/PaddleRec/tdm/tdm_demo/dataset_generator.py
@@ -35,6 +35,7 @@ class TDMDataset(dg.MultiSlotStringDataGenerator):
         """
             Read test_data line by line & yield batch
             """
+
         def local_iter():
             """Read file line by line"""
             for fname in infer_file_list:
@@ -46,13 +47,14 @@ class TDMDataset(dg.MultiSlotStringDataGenerator):
                         yield [input_emb]
 
         import paddle
-        batch_iter = paddle.batch(local_iter, batch)
+        batch_iter = fluid.io.batch(local_iter, batch)
         return batch_iter
 
     def generate_sample(self, line):
         """
         Read the data line by line and process it as a dictionary
         """
+
         def iterator():
             """
             This function needs to be implemented by the user, based on data format
diff --git a/PaddleRec/tdm/tdm_demo/infer_network.py b/PaddleRec/tdm/tdm_demo/infer_network.py
index 8c7a41b853fcf69b94efbbc6ddb8e8723b11faec..a78364d0cdf1ba12f5219bbb941cde9ada297c73 100644
--- a/PaddleRec/tdm/tdm_demo/infer_network.py
+++ b/PaddleRec/tdm/tdm_demo/infer_network.py
@@ -41,26 +41,23 @@ class TdmInferNet(object):
         self.input_trans_net = InputTransNet(args)
 
     def input_data(self):
-        input_emb = fluid.layers.data(
+        input_emb = fluid.data(
             name="input_emb",
-            shape=[self.input_embed_size],
-            dtype="float32",
-        )
+            shape=[None, self.input_embed_size],
+            dtype="float32", )
 
         # first_layer 与 first_layer_mask 对应着infer起始层的节点
-        first_layer = fluid.layers.data(
+        first_layer = fluid.data(
             name="first_layer_node",
-            shape=[1],
+            shape=[None, 1],
             dtype="int64",
-            lod_level=1,
-        )
+            lod_level=1, )
 
-        first_layer_mask = fluid.layers.data(
+        first_layer_mask = fluid.data(
             name="first_layer_node_mask",
-            shape=[1],
+            shape=[None, 1],
             dtype="int64",
-            lod_level=1,
-        )
+            lod_level=1, )
 
         inputs = [input_emb] + [first_layer] + [first_layer_mask]
         return inputs
@@ -125,28 +122,27 @@ class TdmInferNet(object):
                 size=[self.node_nums, self.node_embed_size],
                 param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
 
-            input_fc_out = self.input_trans_net.layer_fc_infer(
-                input_trans_emb, layer_idx)
+            input_fc_out = self.input_trans_net.layer_fc_infer(input_trans_emb,
+                                                               layer_idx)
 
             # 过每一层的分类器
-            layer_classifier_res = self.layer_classifier.classifier_layer_infer(input_fc_out,
-                                                                                node_emb,
-                                                                                layer_idx)
+            layer_classifier_res = self.layer_classifier.classifier_layer_infer(
+                input_fc_out, node_emb, layer_idx)
 
             # 过最终的判别分类器
-            tdm_fc = fluid.layers.fc(input=layer_classifier_res,
-                                     size=self.label_nums,
-                                     act=None,
-                                     num_flatten_dims=2,
-                                     param_attr=fluid.ParamAttr(
-                                         name="tdm.cls_fc.weight"),
-                                     bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
+            tdm_fc = fluid.layers.fc(
+                input=layer_classifier_res,
+                size=self.label_nums,
+                act=None,
+                num_flatten_dims=2,
+                param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
+                bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
 
             prob = fluid.layers.softmax(tdm_fc)
             positive_prob = fluid.layers.slice(
                 prob, axes=[2], starts=[1], ends=[2])
-            prob_re = fluid.layers.reshape(
-                positive_prob, [-1, current_layer_node_num])
+            prob_re = fluid.layers.reshape(positive_prob,
+                                           [-1, current_layer_node_num])
 
             # 过滤掉padding产生的无效节点（node_id=0）
             node_zero_mask = fluid.layers.cast(current_layer_node, 'bool')
@@ -161,11 +157,10 @@ class TdmInferNet(object):
 
             # index_sample op根据下标索引tensor对应位置的值
             # 若paddle版本>2.0，调用方式为paddle.index_sample
-            top_node = fluid.contrib.layers.index_sample(
-                current_layer_node, topk_i)
+            top_node = fluid.contrib.layers.index_sample(current_layer_node,
+                                                         topk_i)
             prob_re_mask = prob_re * current_layer_child_mask  # 过滤掉非叶子节点
-            topk_value = fluid.contrib.layers.index_sample(
-                prob_re_mask, topk_i)
+            topk_value = fluid.contrib.layers.index_sample(prob_re_mask, topk_i)
             node_score.append(topk_value)
             node_list.append(top_node)
 
@@ -190,7 +185,8 @@ class TdmInferNet(object):
         res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1])
 
         # 利用Tree_info信息，将node_id转换为item_id
-        tree_info = fluid.default_main_program().global_block().var("TDM_Tree_Info")
+        tree_info = fluid.default_main_program().global_block().var(
+            "TDM_Tree_Info")
         res_node_emb = fluid.layers.gather_nd(tree_info, res_node)
 
         res_item = fluid.layers.slice(
diff --git a/PaddleRec/text_matching_on_quora/.run_ce.sh b/PaddleRec/text_matching_on_quora/.run_ce.sh
deleted file mode 100755
index f1bb7febd3f2c572544612baf24be14c711108e3..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/.run_ce.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-export MKL_NUM_THREADS=1
-export OMP_NUM_THREADS=1
-
-cudaid=${text_matching_on_quora:=0} # use 0-th card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
-
-cudaid=${text_matching_on_quora_m:=0,1,2,3} # use 0,1,2,3 card as default
-export CUDA_VISIBLE_DEVICES=$cudaid
-
-FLAGS_benchmark=true  python train_and_evaluate.py --model_name=cdssmNet --config=cdssm_base --enable_ce --epoch_num=5 | python _ce.py
diff --git a/PaddleRec/text_matching_on_quora/README.md b/PaddleRec/text_matching_on_quora/README.md
deleted file mode 100644
index eb1346b671e7873918d171cfb2c4993367c9d781..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/README.md
+++ /dev/null
@@ -1,177 +0,0 @@
-# Text matching on Quora qestion-answer pair dataset
-
-## contents
-
-* [Introduction](#introduction)
-  * [a brief review of the Quora Question Pair (QQP) Task](#a-brief-review-of-the-quora-question-pair-qqp-task)
-  * [Our Work](#our-work)
-* [Environment Preparation](#environment-preparation)
-  * [Install Fluid release 1.0](#install-fluid-release-10)
-    * [cpu version](#cpu-version)
-    * [gpu version](#gpu-version)
-    * [Have I installed Fluid successfully?](#have-i-installed-fluid-successfully)
-* [Prepare Data](#prepare-data)
-* [Train and evaluate](#train-and-evaluate)
-* [Models](#models)
-* [Results](#results)
-
-
-## Introduction
-
-### a brief review of the Quora Question Pair (QQP) Task
-
-The [Quora Question Pair](https://data.quora.com/First-Quora-Dataset-Release-Question-Pairs) dataset contains 400,000 question pairs from [Quora](https://www.quora.com/), where people ask and answer questions related to specific areas. Each sample in the dataset consists of two questions (both English) and a label that represents whether the questions are duplicate. The dataset is well annotated by human.
-
-Below are two samples from the dataset. The last column indicates whether the two questions are duplicate (1) or not (0).
-
-|id | qid1 | qid2| question1| question2| is_duplicate
-|:---:|:---:|:---:|:---:|:---:|:---:|
-|0 |1 |2 |What is the step by step guide to invest in share market in india? |What is the step by step guide to invest in share market? |0|
-|1 |3 |4 |What is the story of Kohinoor (Koh-i-Noor) Diamond? | What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back? |0|
-
- A [kaggle competition](https://www.kaggle.com/c/quora-question-pairs#description) was held based on this dataset in 2017. The kagglers were given a training dataset (with labels), and requested to make predictions on a test dataset (without labels). The predictions were evaluated by the log-likelihood loss on the test data.
-
-The kaggle competition has inspired much effective work. However, most of these models are rule-based and difficult to be transferred to new tasks. Researchers are seeking for more general models that work well on this task and other natual language processing (NLP) tasks.
-
-[Wang _et al._](https://arxiv.org/abs/1702.03814) proposed a bilateral multi-perspective matching (BIMPM) model based on the Quora Question Pair dataset. They splitted the original dataset to [3 parts](https://drive.google.com/file/d/0B0PlTAo--BnaQWlsZl9FZ3l1c28/view?usp=sharing): _train.tsv_ (384,348 samples), _dev.tsv_ (10,000 samples) and _test.tsv_ (10,000 samples). The class distribution of _train.tsv_ is unbalanced (37% positive and 63% negative), while those of _dev.tsv_ and _test.tsv_ are balanced(50% positive and 50% negetive). We used the same splitting method in our experiments.
-
-### Our Work
-
-Based on the Quora Question Pair Dataset, we implemented some classic models in the area of neural language understanding (NLU). The accuracy of prediction results are evaluated on the _test.tsv_ from [Wang _et al._](https://arxiv.org/abs/1702.03814).
-
-## Environment Preparation
-
-### Install Fluid release 1.0
-
-Please follow the [official document in English](http://www.paddlepaddle.org/documentation/docs/en/1.0/build_and_install/pip_install_en.html) or [official document in Chinese](http://www.paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/install/Start.html) to install the Fluid deep learning framework.
-
-#### Have I installed Fluid successfully?
-
-Run the following script from your command line:
-
-```shell
-python -c "import paddle"
-```
-
-If Fluid is installed successfully you should see no error message. Feel free to open issues under the [PaddlePaddle repository](https://github.com/PaddlePaddle/Paddle/issues) for support.
-
-## Prepare Data
-
-Please download the Quora dataset from [Google drive](https://drive.google.com/file/d/0B0PlTAo--BnaQWlsZl9FZ3l1c28/view?usp=sharing) and unzip to $HOME/.cache/paddle/dataset.
-
-Then run _data/prepare_quora_data.sh_ to download the pre-trained _word2vec_ embedding file -- _glove.840B.300d.zip_:
-
-```shell
-sh data/prepare_quora_data.sh  
-```
-
-At this point the dataset directory ($HOME/.cache/paddle/dataset) structure should be:
-
-```shell
-
-$HOME/.cache/paddle/dataset
-    |- Quora_question_pair_partition
-        |- train.tsv
-        |- test.tsv
-        |- dev.tsv
-        |- readme.txt
-        |- wordvec.txt
-    |- glove.840B.300d.txt
-```
-
-## Train and evaluate
-
-We provide multiple models and configurations. Details are shown in `models` and `configs` directories. For a quick start, please run the _cdssmNet_ model with the corresponding configuration:
-
-```shell
-python train_and_evaluate.py  \
-    --model_name=cdssmNet  \
-    --config=cdssm_base
-```
-
-Logs will be output to the console. If everything works well, the logging information will have the same formats as the content in _cdssm_base.log_.
-
-All configurations used in our experiments are as follows:
-
-|Model|Config|command
-|:----:|:----:|:----:|
-|cdssmNet|cdssm_base|python train_and_evaluate.py  --model_name=cdssmNet  --config=cdssm_base
-|DecAttNet|decatt_glove|python train_and_evaluate.py --model_name=DecAttNet  --config=decatt_glove
-|InferSentNet|infer_sent_v1|python train_and_evaluate.py --model_name=InferSentNet --config=infer_sent_v1
-|InferSentNet|infer_sent_v2|python train_and_evaluate.py --model_name=InferSentNet --config=infer_sent_v2
-|SSENet|sse_base|python train_and_evaluate.py  --model_name=SSENet  --config=sse_base
-
-## Models
-
-We implemeted 4 models for now: the convolutional deep-structured semantic model (CDSSM, CNN-based), the InferSent model (RNN-based), the shortcut-stacked encoder (SSE, RNN-based), and the decomposed attention model (DecAtt, attention-based).
-
-|Model|features|Context Encoder|Match Layer|Classification Layer
-|:----:|:----:|:----:|:----:|:----:|
-|CDSSM|word|1 layer conv1d|concatenation|MLP
-|DecAtt|word|Attention|concatenation|MLP
-|InferSent|word|1 layer Bi-LSTM|concatenation/element-wise product/<br>absolute element-wise difference|MLP
-|SSE|word|3 layer Bi-LSTM|concatenation/element-wise product/<br>absolute element-wise difference|MLP
-
-### CDSSM
-
-```
-@inproceedings{shen2014learning,
-  title={Learning semantic representations using convolutional neural networks for web search},
-  author={Shen, Yelong and He, Xiaodong and Gao, Jianfeng and Deng, Li and Mesnil, Gr{\'e}goire},
-  booktitle={Proceedings of the 23rd International Conference on World Wide Web},
-  pages={373--374},
-  year={2014},
-  organization={ACM}
-}
-```
-
-### InferSent
-
-```
-@article{conneau2017supervised,
-  title={Supervised learning of universal sentence representations from natural language inference data},
-  author={Conneau, Alexis and Kiela, Douwe and Schwenk, Holger and Barrault, Loic and Bordes, Antoine},
-  journal={arXiv preprint arXiv:1705.02364},
-  year={2017}
-}
-```
-
-### SSE
-
-```
-@article{nie2017shortcut,
-  title={Shortcut-stacked sentence encoders for multi-domain inference},
-  author={Nie, Yixin and Bansal, Mohit},
-  journal={arXiv preprint arXiv:1708.02312},
-  year={2017}
-}
-```
-
-### DecAtt
-
-```
-@article{tomar2017neural,
-  title={Neural paraphrase identification of questions with noisy pretraining},
-  author={Tomar, Gaurav Singh and Duque, Thyago and T{\"a}ckstr{\"o}m, Oscar and Uszkoreit, Jakob and Das, Dipanjan},
-  journal={arXiv preprint arXiv:1704.04565},
-  year={2017}
-}
-```
-
-## Results
-
-|Model|Config|dev accuracy| test accuracy
-|:----:|:----:|:----:|:----:|
-|cdssmNet|cdssm_base|83.56%|82.83%|
-|DecAttNet|decatt_glove|86.31%|86.22%|
-|InferSentNet|infer_sent_v1|87.15%|86.62%|
-|InferSentNet|infer_sent_v2|88.55%|88.43%|
-|SSENet|sse_base|88.35%|88.25%|
-
-In our experiment, we found that LSTM-based models outperformed convolution-based models. The DecAtt model has fewer parameters than LSTM-based models, but is sensitive to hyper-parameters.
-
-<p align="center">
-
- <img src="imgs/models_test_acc.png" width = "500" alt="test_acc"/>
-
-</p>
diff --git a/PaddleRec/text_matching_on_quora/__init__.py b/PaddleRec/text_matching_on_quora/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/PaddleRec/text_matching_on_quora/_ce.py b/PaddleRec/text_matching_on_quora/_ce.py
deleted file mode 100644
index 930d7142dff933c8cd6f9139d5b3c5847931e43d..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/_ce.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# this file is only used for continuous evaluation test!
-
-import os
-import sys
-sys.path.append(os.environ['ceroot'])
-from kpi import CostKpi
-from kpi import DurationKpi
-
-each_pass_duration_card1_kpi = DurationKpi(
-    'each_pass_duration_card1', 0.08, 0, actived=True)
-train_avg_cost_card1_kpi = CostKpi('train_avg_cost_card1', 0.08, 0)
-train_avg_acc_card1_kpi = CostKpi('train_avg_acc_card1', 0.02, 0)
-each_pass_duration_card4_kpi = DurationKpi(
-    'each_pass_duration_card4', 0.08, 0, actived=True)
-train_avg_cost_card4_kpi = CostKpi('train_avg_cost_card4', 0.08, 0)
-train_avg_acc_card4_kpi = CostKpi('train_avg_acc_card4', 0.02, 0)
-
-tracking_kpis = [
-    each_pass_duration_card1_kpi,
-    train_avg_cost_card1_kpi,
-    train_avg_acc_card1_kpi,
-    each_pass_duration_card4_kpi,
-    train_avg_cost_card4_kpi,
-    train_avg_acc_card4_kpi,
-]
-
-
-def parse_log(log):
-    '''
-    This method should be implemented by model developers.
-
-    The suggestion:
-
-    each line in the log should be key, value, for example:
-
-    "
-    train_cost\t1.0
-    test_cost\t1.0
-    train_cost\t1.0
-    train_cost\t1.0
-    train_acc\t1.2
-    "
-    '''
-    for line in log.split('\n'):
-        fs = line.strip().split('\t')
-        print(fs)
-        if len(fs) == 3 and fs[0] == 'kpis':
-            kpi_name = fs[1]
-            kpi_value = float(fs[2])
-            yield kpi_name, kpi_value
-
-
-def log_to_ce(log):
-    kpi_tracker = {}
-    for kpi in tracking_kpis:
-        kpi_tracker[kpi.name] = kpi
-
-    for (kpi_name, kpi_value) in parse_log(log):
-        print(kpi_name, kpi_value)
-        kpi_tracker[kpi_name].add_record(kpi_value)
-        kpi_tracker[kpi_name].persist()
-
-
-if __name__ == '__main__':
-    log = sys.stdin.read()
-    log_to_ce(log)
diff --git a/PaddleRec/text_matching_on_quora/cdssm_base.log b/PaddleRec/text_matching_on_quora/cdssm_base.log
deleted file mode 100644
index ec29816153289d9b35ce6ff86e052690dfcf2c22..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/cdssm_base.log
+++ /dev/null
@@ -1,1834 +0,0 @@
-net_name:  cdssmNet
-config {'save_dirname': 'model_dir', 'optimizer_type': 'adam', 'duplicate_data': False, 'train_samples_num': 384348, 'droprate_fc': 0.1, 'fc_dim': 128, 'kernel_count': 300, 'mlp_hid_dim': [128, 128], 'OOV_fill': 'uniform', 'class_dim': 2, 'epoch_num': 50, 'lr_decay': 1, 'learning_rate': 0.001, 'batch_size': 128, 'use_lod_tensor': True, 'metric_type': ['accuracy'], 'embedding_norm': False, 'emb_dim': 300, 'droprate_conv': 0.1, 'use_pretrained_word_embedding': True, 'kernel_size': 5, 'dict_dim': 40000}
-Generating word dict...
-('Vocab size: ', 36057)
-loading word2vec from  /home/dongdaxiang/.cache/paddle/dataset/glove.840B.300d.txt
-preparing pretrained word embedding ...
-pretrained_word_embedding to be load: [[-0.086864    0.19161     0.10915    ... -0.01516     0.11108
-   0.2065    ]
- [ 0.27204    -0.06203    -0.1884     ...  0.13015    -0.18317
-   0.1323    ]
- [-0.20628     0.36716    -0.071933   ...  0.14271     0.50059
-   0.038025  ]
- ...
- [ 0.03847164  0.01711482  0.01181574 ...  0.03926358 -0.04032813
-  -0.02135365]
- [ 0.04201478 -0.02560226 -0.02281064 ...  0.00920258  0.04321
-   0.0227482 ]
- [-0.04984529 -0.00176931  0.03022346 ...  0.0298265   0.02384543
-   0.00974313]]
-param name: emb.w; param shape: (40000L, 300L)
-param name: conv1d.w; param shape: (1500L, 300L)
-param name: fc1.w; param shape: (300L, 128L)
-param name: fc1.b; param shape: (128L,)
-param name: fc_2.w_0; param shape: (256L, 128L)
-param name: fc_2.b_0; param shape: (128L,)
-param name: fc_3.w_0; param shape: (128L, 128L)
-param name: fc_3.b_0; param shape: (128L,)
-param name: fc_4.w_0; param shape: (128L, 2L)
-param name: fc_4.b_0; param shape: (2L,)
-loading pretrained word embedding to param
-[Wed Oct 10 16:33:18 2018] epoch_id: -1, dev_cost: 0.693804, accuracy: 0.5109
-[Wed Oct 10 16:33:18 2018] epoch_id: -1, test_cost: 0.693670, accuracy: 0.5096
-
-[Wed Oct 10 16:33:18 2018] Start Training
-[Wed Oct 10 16:33:27 2018] epoch_id: 0, batch_id: 0, cost: 0.699992, acc: 0.515625
-[Wed Oct 10 16:33:30 2018] epoch_id: 0, batch_id: 100, cost: 0.557354, acc: 0.695312
-[Wed Oct 10 16:33:33 2018] epoch_id: 0, batch_id: 200, cost: 0.548301, acc: 0.742188
-[Wed Oct 10 16:33:35 2018] epoch_id: 0, batch_id: 300, cost: 0.528907, acc: 0.742188
-[Wed Oct 10 16:33:39 2018] epoch_id: 0, batch_id: 400, cost: 0.482460, acc: 0.781250
-[Wed Oct 10 16:33:41 2018] epoch_id: 0, batch_id: 500, cost: 0.494885, acc: 0.718750
-[Wed Oct 10 16:33:44 2018] epoch_id: 0, batch_id: 600, cost: 0.600175, acc: 0.695312
-[Wed Oct 10 16:33:46 2018] epoch_id: 0, batch_id: 700, cost: 0.477964, acc: 0.757812
-[Wed Oct 10 16:33:49 2018] epoch_id: 0, batch_id: 800, cost: 0.468172, acc: 0.750000
-[Wed Oct 10 16:33:51 2018] epoch_id: 0, batch_id: 900, cost: 0.394047, acc: 0.835938
-[Wed Oct 10 16:33:54 2018] epoch_id: 0, batch_id: 1000, cost: 0.520142, acc: 0.734375
-[Wed Oct 10 16:33:56 2018] epoch_id: 0, batch_id: 1100, cost: 0.471779, acc: 0.757812
-[Wed Oct 10 16:33:59 2018] epoch_id: 0, batch_id: 1200, cost: 0.407287, acc: 0.789062
-[Wed Oct 10 16:34:01 2018] epoch_id: 0, batch_id: 1300, cost: 0.430800, acc: 0.812500
-[Wed Oct 10 16:34:03 2018] epoch_id: 0, batch_id: 1400, cost: 0.421967, acc: 0.796875
-[Wed Oct 10 16:34:06 2018] epoch_id: 0, batch_id: 1500, cost: 0.388925, acc: 0.835938
-[Wed Oct 10 16:34:08 2018] epoch_id: 0, batch_id: 1600, cost: 0.445022, acc: 0.796875
-[Wed Oct 10 16:34:10 2018] epoch_id: 0, batch_id: 1700, cost: 0.439095, acc: 0.796875
-[Wed Oct 10 16:34:13 2018] epoch_id: 0, batch_id: 1800, cost: 0.448246, acc: 0.765625
-[Wed Oct 10 16:34:15 2018] epoch_id: 0, batch_id: 1900, cost: 0.377162, acc: 0.789062
-[Wed Oct 10 16:34:17 2018] epoch_id: 0, batch_id: 2000, cost: 0.460397, acc: 0.820312
-[Wed Oct 10 16:34:20 2018] epoch_id: 0, batch_id: 2100, cost: 0.416145, acc: 0.812500
-[Wed Oct 10 16:34:22 2018] epoch_id: 0, batch_id: 2200, cost: 0.509166, acc: 0.710938
-[Wed Oct 10 16:34:24 2018] epoch_id: 0, batch_id: 2300, cost: 0.450925, acc: 0.765625
-[Wed Oct 10 16:34:26 2018] epoch_id: 0, batch_id: 2400, cost: 0.457177, acc: 0.796875
-[Wed Oct 10 16:34:29 2018] epoch_id: 0, batch_id: 2500, cost: 0.454368, acc: 0.851562
-[Wed Oct 10 16:34:31 2018] epoch_id: 0, batch_id: 2600, cost: 0.478799, acc: 0.750000
-[Wed Oct 10 16:34:34 2018] epoch_id: 0, batch_id: 2700, cost: 0.521526, acc: 0.757812
-[Wed Oct 10 16:34:36 2018] epoch_id: 0, batch_id: 2800, cost: 0.476336, acc: 0.789062
-[Wed Oct 10 16:34:38 2018] epoch_id: 0, batch_id: 2900, cost: 0.407489, acc: 0.812500
-[Wed Oct 10 16:34:41 2018] epoch_id: 0, batch_id: 3000, cost: 0.404804, acc: 0.820312
-
-[Wed Oct 10 16:34:42 2018] epoch_id: 0, train_avg_cost: 0.456508, train_avg_acc: 0.779733
-[Wed Oct 10 16:34:43 2018] epoch_id: 0, dev_cost: 0.469818, accuracy: 0.7691
-[Wed Oct 10 16:34:44 2018] epoch_id: 0, test_cost: 0.462696, accuracy: 0.7734
-
-[Wed Oct 10 16:34:53 2018] epoch_id: 1, batch_id: 0, cost: 0.381106, acc: 0.820312
-[Wed Oct 10 16:34:56 2018] epoch_id: 1, batch_id: 100, cost: 0.325008, acc: 0.859375
-[Wed Oct 10 16:34:58 2018] epoch_id: 1, batch_id: 200, cost: 0.318922, acc: 0.843750
-[Wed Oct 10 16:35:00 2018] epoch_id: 1, batch_id: 300, cost: 0.359727, acc: 0.804688
-[Wed Oct 10 16:35:03 2018] epoch_id: 1, batch_id: 400, cost: 0.308632, acc: 0.875000
-[Wed Oct 10 16:35:05 2018] epoch_id: 1, batch_id: 500, cost: 0.326841, acc: 0.851562
-[Wed Oct 10 16:35:09 2018] epoch_id: 1, batch_id: 600, cost: 0.398975, acc: 0.796875
-[Wed Oct 10 16:35:12 2018] epoch_id: 1, batch_id: 700, cost: 0.296837, acc: 0.867188
-[Wed Oct 10 16:35:14 2018] epoch_id: 1, batch_id: 800, cost: 0.289739, acc: 0.867188
-[Wed Oct 10 16:35:17 2018] epoch_id: 1, batch_id: 900, cost: 0.315425, acc: 0.835938
-[Wed Oct 10 16:35:19 2018] epoch_id: 1, batch_id: 1000, cost: 0.340806, acc: 0.828125
-[Wed Oct 10 16:35:22 2018] epoch_id: 1, batch_id: 1100, cost: 0.383585, acc: 0.828125
-[Wed Oct 10 16:35:24 2018] epoch_id: 1, batch_id: 1200, cost: 0.317520, acc: 0.843750
-[Wed Oct 10 16:35:26 2018] epoch_id: 1, batch_id: 1300, cost: 0.308717, acc: 0.875000
-[Wed Oct 10 16:35:29 2018] epoch_id: 1, batch_id: 1400, cost: 0.320688, acc: 0.828125
-[Wed Oct 10 16:35:31 2018] epoch_id: 1, batch_id: 1500, cost: 0.353638, acc: 0.812500
-[Wed Oct 10 16:35:34 2018] epoch_id: 1, batch_id: 1600, cost: 0.379113, acc: 0.804688
-[Wed Oct 10 16:35:36 2018] epoch_id: 1, batch_id: 1700, cost: 0.309887, acc: 0.859375
-[Wed Oct 10 16:35:38 2018] epoch_id: 1, batch_id: 1800, cost: 0.316372, acc: 0.859375
-[Wed Oct 10 16:35:41 2018] epoch_id: 1, batch_id: 1900, cost: 0.405585, acc: 0.804688
-[Wed Oct 10 16:35:43 2018] epoch_id: 1, batch_id: 2000, cost: 0.336917, acc: 0.851562
-[Wed Oct 10 16:35:45 2018] epoch_id: 1, batch_id: 2100, cost: 0.347034, acc: 0.835938
-[Wed Oct 10 16:35:48 2018] epoch_id: 1, batch_id: 2200, cost: 0.379728, acc: 0.835938
-[Wed Oct 10 16:35:50 2018] epoch_id: 1, batch_id: 2300, cost: 0.395257, acc: 0.820312
-[Wed Oct 10 16:35:53 2018] epoch_id: 1, batch_id: 2400, cost: 0.398583, acc: 0.812500
-[Wed Oct 10 16:35:55 2018] epoch_id: 1, batch_id: 2500, cost: 0.356259, acc: 0.859375
-[Wed Oct 10 16:35:57 2018] epoch_id: 1, batch_id: 2600, cost: 0.297765, acc: 0.835938
-[Wed Oct 10 16:35:59 2018] epoch_id: 1, batch_id: 2700, cost: 0.353899, acc: 0.835938
-[Wed Oct 10 16:36:02 2018] epoch_id: 1, batch_id: 2800, cost: 0.377699, acc: 0.820312
-[Wed Oct 10 16:36:04 2018] epoch_id: 1, batch_id: 2900, cost: 0.388959, acc: 0.804688
-[Wed Oct 10 16:36:06 2018] epoch_id: 1, batch_id: 3000, cost: 0.344840, acc: 0.835938
-
-[Wed Oct 10 16:36:07 2018] epoch_id: 1, train_avg_cost: 0.346376, train_avg_acc: 0.842572
-[Wed Oct 10 16:36:08 2018] epoch_id: 1, dev_cost: 0.402576, accuracy: 0.8094
-[Wed Oct 10 16:36:09 2018] epoch_id: 1, test_cost: 0.397121, accuracy: 0.8185
-
-[Wed Oct 10 16:36:18 2018] epoch_id: 2, batch_id: 0, cost: 0.280530, acc: 0.890625
-[Wed Oct 10 16:36:20 2018] epoch_id: 2, batch_id: 100, cost: 0.233576, acc: 0.906250
-[Wed Oct 10 16:36:22 2018] epoch_id: 2, batch_id: 200, cost: 0.245128, acc: 0.898438
-[Wed Oct 10 16:36:25 2018] epoch_id: 2, batch_id: 300, cost: 0.183943, acc: 0.906250
-[Wed Oct 10 16:36:27 2018] epoch_id: 2, batch_id: 400, cost: 0.270915, acc: 0.882812
-[Wed Oct 10 16:36:30 2018] epoch_id: 2, batch_id: 500, cost: 0.248726, acc: 0.906250
-[Wed Oct 10 16:36:32 2018] epoch_id: 2, batch_id: 600, cost: 0.243351, acc: 0.921875
-[Wed Oct 10 16:36:35 2018] epoch_id: 2, batch_id: 700, cost: 0.314026, acc: 0.812500
-[Wed Oct 10 16:36:38 2018] epoch_id: 2, batch_id: 800, cost: 0.336282, acc: 0.867188
-[Wed Oct 10 16:36:41 2018] epoch_id: 2, batch_id: 900, cost: 0.290222, acc: 0.875000
-[Wed Oct 10 16:36:43 2018] epoch_id: 2, batch_id: 1000, cost: 0.287339, acc: 0.859375
-[Wed Oct 10 16:36:45 2018] epoch_id: 2, batch_id: 1100, cost: 0.225436, acc: 0.890625
-[Wed Oct 10 16:36:48 2018] epoch_id: 2, batch_id: 1200, cost: 0.346974, acc: 0.859375
-[Wed Oct 10 16:36:50 2018] epoch_id: 2, batch_id: 1300, cost: 0.283542, acc: 0.843750
-[Wed Oct 10 16:36:53 2018] epoch_id: 2, batch_id: 1400, cost: 0.203151, acc: 0.921875
-[Wed Oct 10 16:36:55 2018] epoch_id: 2, batch_id: 1500, cost: 0.255483, acc: 0.906250
-[Wed Oct 10 16:36:58 2018] epoch_id: 2, batch_id: 1600, cost: 0.275010, acc: 0.898438
-[Wed Oct 10 16:37:00 2018] epoch_id: 2, batch_id: 1700, cost: 0.264693, acc: 0.867188
-[Wed Oct 10 16:37:03 2018] epoch_id: 2, batch_id: 1800, cost: 0.257360, acc: 0.890625
-[Wed Oct 10 16:37:05 2018] epoch_id: 2, batch_id: 1900, cost: 0.150528, acc: 0.921875
-[Wed Oct 10 16:37:08 2018] epoch_id: 2, batch_id: 2000, cost: 0.229797, acc: 0.906250
-[Wed Oct 10 16:37:11 2018] epoch_id: 2, batch_id: 2100, cost: 0.261790, acc: 0.867188
-[Wed Oct 10 16:37:14 2018] epoch_id: 2, batch_id: 2200, cost: 0.201237, acc: 0.914062
-[Wed Oct 10 16:37:16 2018] epoch_id: 2, batch_id: 2300, cost: 0.296701, acc: 0.875000
-[Wed Oct 10 16:37:19 2018] epoch_id: 2, batch_id: 2400, cost: 0.315291, acc: 0.875000
-[Wed Oct 10 16:37:21 2018] epoch_id: 2, batch_id: 2500, cost: 0.282715, acc: 0.843750
-[Wed Oct 10 16:37:24 2018] epoch_id: 2, batch_id: 2600, cost: 0.296843, acc: 0.843750
-[Wed Oct 10 16:37:26 2018] epoch_id: 2, batch_id: 2700, cost: 0.363040, acc: 0.843750
-[Wed Oct 10 16:37:29 2018] epoch_id: 2, batch_id: 2800, cost: 0.262465, acc: 0.867188
-[Wed Oct 10 16:37:31 2018] epoch_id: 2, batch_id: 2900, cost: 0.208009, acc: 0.906250
-[Wed Oct 10 16:37:34 2018] epoch_id: 2, batch_id: 3000, cost: 0.247068, acc: 0.867188
-
-[Wed Oct 10 16:37:34 2018] epoch_id: 2, train_avg_cost: 0.267260, train_avg_acc: 0.884560
-[Wed Oct 10 16:37:36 2018] epoch_id: 2, dev_cost: 0.434485, accuracy: 0.8153
-[Wed Oct 10 16:37:37 2018] epoch_id: 2, test_cost: 0.425083, accuracy: 0.8243
-
-[Wed Oct 10 16:37:46 2018] epoch_id: 3, batch_id: 0, cost: 0.130899, acc: 0.945312
-[Wed Oct 10 16:37:49 2018] epoch_id: 3, batch_id: 100, cost: 0.174115, acc: 0.914062
-[Wed Oct 10 16:37:52 2018] epoch_id: 3, batch_id: 200, cost: 0.162655, acc: 0.929688
-[Wed Oct 10 16:37:54 2018] epoch_id: 3, batch_id: 300, cost: 0.156763, acc: 0.937500
-[Wed Oct 10 16:37:56 2018] epoch_id: 3, batch_id: 400, cost: 0.171531, acc: 0.929688
-[Wed Oct 10 16:37:59 2018] epoch_id: 3, batch_id: 500, cost: 0.124120, acc: 0.937500
-[Wed Oct 10 16:38:02 2018] epoch_id: 3, batch_id: 600, cost: 0.172306, acc: 0.929688
-[Wed Oct 10 16:38:04 2018] epoch_id: 3, batch_id: 700, cost: 0.352722, acc: 0.867188
-[Wed Oct 10 16:38:06 2018] epoch_id: 3, batch_id: 800, cost: 0.179998, acc: 0.929688
-[Wed Oct 10 16:38:09 2018] epoch_id: 3, batch_id: 900, cost: 0.197941, acc: 0.921875
-[Wed Oct 10 16:38:11 2018] epoch_id: 3, batch_id: 1000, cost: 0.163592, acc: 0.937500
-[Wed Oct 10 16:38:14 2018] epoch_id: 3, batch_id: 1100, cost: 0.196162, acc: 0.882812
-[Wed Oct 10 16:38:16 2018] epoch_id: 3, batch_id: 1200, cost: 0.201064, acc: 0.929688
-[Wed Oct 10 16:38:19 2018] epoch_id: 3, batch_id: 1300, cost: 0.162742, acc: 0.921875
-[Wed Oct 10 16:38:21 2018] epoch_id: 3, batch_id: 1400, cost: 0.192062, acc: 0.890625
-[Wed Oct 10 16:38:23 2018] epoch_id: 3, batch_id: 1500, cost: 0.215189, acc: 0.914062
-[Wed Oct 10 16:38:26 2018] epoch_id: 3, batch_id: 1600, cost: 0.148390, acc: 0.945312
-[Wed Oct 10 16:38:28 2018] epoch_id: 3, batch_id: 1700, cost: 0.148536, acc: 0.937500
-[Wed Oct 10 16:38:32 2018] epoch_id: 3, batch_id: 1800, cost: 0.122290, acc: 0.960938
-[Wed Oct 10 16:38:34 2018] epoch_id: 3, batch_id: 1900, cost: 0.152864, acc: 0.945312
-[Wed Oct 10 16:38:37 2018] epoch_id: 3, batch_id: 2000, cost: 0.250165, acc: 0.914062
-[Wed Oct 10 16:38:39 2018] epoch_id: 3, batch_id: 2100, cost: 0.197931, acc: 0.929688
-[Wed Oct 10 16:38:42 2018] epoch_id: 3, batch_id: 2200, cost: 0.167291, acc: 0.937500
-[Wed Oct 10 16:38:44 2018] epoch_id: 3, batch_id: 2300, cost: 0.243269, acc: 0.898438
-[Wed Oct 10 16:38:47 2018] epoch_id: 3, batch_id: 2400, cost: 0.170633, acc: 0.921875
-[Wed Oct 10 16:38:49 2018] epoch_id: 3, batch_id: 2500, cost: 0.182344, acc: 0.921875
-[Wed Oct 10 16:38:52 2018] epoch_id: 3, batch_id: 2600, cost: 0.267497, acc: 0.921875
-[Wed Oct 10 16:38:54 2018] epoch_id: 3, batch_id: 2700, cost: 0.170150, acc: 0.929688
-[Wed Oct 10 16:38:56 2018] epoch_id: 3, batch_id: 2800, cost: 0.198175, acc: 0.890625
-[Wed Oct 10 16:38:59 2018] epoch_id: 3, batch_id: 2900, cost: 0.231687, acc: 0.898438
-[Wed Oct 10 16:39:01 2018] epoch_id: 3, batch_id: 3000, cost: 0.280869, acc: 0.882812
-
-[Wed Oct 10 16:39:02 2018] epoch_id: 3, train_avg_cost: 0.203352, train_avg_acc: 0.915808
-[Wed Oct 10 16:39:03 2018] epoch_id: 3, dev_cost: 0.413912, accuracy: 0.8304
-[Wed Oct 10 16:39:04 2018] epoch_id: 3, test_cost: 0.409365, accuracy: 0.8341
-
-[Wed Oct 10 16:39:13 2018] epoch_id: 4, batch_id: 0, cost: 0.208998, acc: 0.945312
-[Wed Oct 10 16:39:16 2018] epoch_id: 4, batch_id: 100, cost: 0.148128, acc: 0.929688
-[Wed Oct 10 16:39:18 2018] epoch_id: 4, batch_id: 200, cost: 0.079264, acc: 0.976562
-[Wed Oct 10 16:39:21 2018] epoch_id: 4, batch_id: 300, cost: 0.125277, acc: 0.937500
-[Wed Oct 10 16:39:23 2018] epoch_id: 4, batch_id: 400, cost: 0.105227, acc: 0.968750
-[Wed Oct 10 16:39:25 2018] epoch_id: 4, batch_id: 500, cost: 0.063737, acc: 0.984375
-[Wed Oct 10 16:39:28 2018] epoch_id: 4, batch_id: 600, cost: 0.148419, acc: 0.937500
-[Wed Oct 10 16:39:30 2018] epoch_id: 4, batch_id: 700, cost: 0.118386, acc: 0.937500
-[Wed Oct 10 16:39:33 2018] epoch_id: 4, batch_id: 800, cost: 0.236417, acc: 0.898438
-[Wed Oct 10 16:39:35 2018] epoch_id: 4, batch_id: 900, cost: 0.131614, acc: 0.945312
-[Wed Oct 10 16:39:38 2018] epoch_id: 4, batch_id: 1000, cost: 0.134897, acc: 0.953125
-[Wed Oct 10 16:39:40 2018] epoch_id: 4, batch_id: 1100, cost: 0.152974, acc: 0.945312
-[Wed Oct 10 16:39:43 2018] epoch_id: 4, batch_id: 1200, cost: 0.173617, acc: 0.937500
-[Wed Oct 10 16:39:45 2018] epoch_id: 4, batch_id: 1300, cost: 0.128535, acc: 0.937500
-[Wed Oct 10 16:39:48 2018] epoch_id: 4, batch_id: 1400, cost: 0.156204, acc: 0.945312
-[Wed Oct 10 16:39:50 2018] epoch_id: 4, batch_id: 1500, cost: 0.130960, acc: 0.937500
-[Wed Oct 10 16:39:53 2018] epoch_id: 4, batch_id: 1600, cost: 0.185379, acc: 0.914062
-[Wed Oct 10 16:39:55 2018] epoch_id: 4, batch_id: 1700, cost: 0.092890, acc: 0.960938
-[Wed Oct 10 16:39:58 2018] epoch_id: 4, batch_id: 1800, cost: 0.147196, acc: 0.929688
-[Wed Oct 10 16:40:00 2018] epoch_id: 4, batch_id: 1900, cost: 0.153621, acc: 0.953125
-[Wed Oct 10 16:40:03 2018] epoch_id: 4, batch_id: 2000, cost: 0.153048, acc: 0.921875
-[Wed Oct 10 16:40:05 2018] epoch_id: 4, batch_id: 2100, cost: 0.205303, acc: 0.898438
-[Wed Oct 10 16:40:07 2018] epoch_id: 4, batch_id: 2200, cost: 0.139906, acc: 0.960938
-[Wed Oct 10 16:40:10 2018] epoch_id: 4, batch_id: 2300, cost: 0.254768, acc: 0.890625
-[Wed Oct 10 16:40:12 2018] epoch_id: 4, batch_id: 2400, cost: 0.076761, acc: 0.968750
-[Wed Oct 10 16:40:14 2018] epoch_id: 4, batch_id: 2500, cost: 0.199733, acc: 0.906250
-[Wed Oct 10 16:40:16 2018] epoch_id: 4, batch_id: 2600, cost: 0.310914, acc: 0.882812
-[Wed Oct 10 16:40:19 2018] epoch_id: 4, batch_id: 2700, cost: 0.148558, acc: 0.921875
-[Wed Oct 10 16:40:21 2018] epoch_id: 4, batch_id: 2800, cost: 0.164562, acc: 0.921875
-[Wed Oct 10 16:40:23 2018] epoch_id: 4, batch_id: 2900, cost: 0.177139, acc: 0.921875
-[Wed Oct 10 16:40:26 2018] epoch_id: 4, batch_id: 3000, cost: 0.112299, acc: 0.968750
-
-[Wed Oct 10 16:40:27 2018] epoch_id: 4, train_avg_cost: 0.156220, train_avg_acc: 0.937780
-[Wed Oct 10 16:40:28 2018] epoch_id: 4, dev_cost: 0.468851, accuracy: 0.8348
-[Wed Oct 10 16:40:29 2018] epoch_id: 4, test_cost: 0.468213, accuracy: 0.8368
-
-[Wed Oct 10 16:40:38 2018] epoch_id: 5, batch_id: 0, cost: 0.084071, acc: 0.976562
-[Wed Oct 10 16:40:41 2018] epoch_id: 5, batch_id: 100, cost: 0.052093, acc: 0.968750
-[Wed Oct 10 16:40:43 2018] epoch_id: 5, batch_id: 200, cost: 0.193576, acc: 0.929688
-[Wed Oct 10 16:40:46 2018] epoch_id: 5, batch_id: 300, cost: 0.075502, acc: 0.968750
-[Wed Oct 10 16:40:48 2018] epoch_id: 5, batch_id: 400, cost: 0.079619, acc: 0.976562
-[Wed Oct 10 16:40:51 2018] epoch_id: 5, batch_id: 500, cost: 0.124719, acc: 0.945312
-[Wed Oct 10 16:40:53 2018] epoch_id: 5, batch_id: 600, cost: 0.157322, acc: 0.929688
-[Wed Oct 10 16:40:56 2018] epoch_id: 5, batch_id: 700, cost: 0.100680, acc: 0.945312
-[Wed Oct 10 16:40:58 2018] epoch_id: 5, batch_id: 800, cost: 0.164627, acc: 0.937500
-[Wed Oct 10 16:41:00 2018] epoch_id: 5, batch_id: 900, cost: 0.113826, acc: 0.960938
-[Wed Oct 10 16:41:03 2018] epoch_id: 5, batch_id: 1000, cost: 0.122406, acc: 0.953125
-[Wed Oct 10 16:41:05 2018] epoch_id: 5, batch_id: 1100, cost: 0.098428, acc: 0.960938
-[Wed Oct 10 16:41:08 2018] epoch_id: 5, batch_id: 1200, cost: 0.175987, acc: 0.914062
-[Wed Oct 10 16:41:10 2018] epoch_id: 5, batch_id: 1300, cost: 0.161037, acc: 0.929688
-[Wed Oct 10 16:41:12 2018] epoch_id: 5, batch_id: 1400, cost: 0.058083, acc: 0.976562
-[Wed Oct 10 16:41:14 2018] epoch_id: 5, batch_id: 1500, cost: 0.099512, acc: 0.953125
-[Wed Oct 10 16:41:17 2018] epoch_id: 5, batch_id: 1600, cost: 0.155458, acc: 0.929688
-[Wed Oct 10 16:41:19 2018] epoch_id: 5, batch_id: 1700, cost: 0.149099, acc: 0.953125
-[Wed Oct 10 16:41:21 2018] epoch_id: 5, batch_id: 1800, cost: 0.184663, acc: 0.945312
-[Wed Oct 10 16:41:24 2018] epoch_id: 5, batch_id: 1900, cost: 0.153789, acc: 0.945312
-[Wed Oct 10 16:41:26 2018] epoch_id: 5, batch_id: 2000, cost: 0.135054, acc: 0.945312
-[Wed Oct 10 16:41:28 2018] epoch_id: 5, batch_id: 2100, cost: 0.091075, acc: 0.960938
-[Wed Oct 10 16:41:30 2018] epoch_id: 5, batch_id: 2200, cost: 0.175665, acc: 0.937500
-[Wed Oct 10 16:41:33 2018] epoch_id: 5, batch_id: 2300, cost: 0.092569, acc: 0.976562
-[Wed Oct 10 16:41:35 2018] epoch_id: 5, batch_id: 2400, cost: 0.171366, acc: 0.929688
-[Wed Oct 10 16:41:37 2018] epoch_id: 5, batch_id: 2500, cost: 0.077127, acc: 0.984375
-[Wed Oct 10 16:41:39 2018] epoch_id: 5, batch_id: 2600, cost: 0.133260, acc: 0.960938
-[Wed Oct 10 16:41:43 2018] epoch_id: 5, batch_id: 2700, cost: 0.130742, acc: 0.953125
-[Wed Oct 10 16:41:45 2018] epoch_id: 5, batch_id: 2800, cost: 0.165412, acc: 0.945312
-[Wed Oct 10 16:41:48 2018] epoch_id: 5, batch_id: 2900, cost: 0.099631, acc: 0.953125
-[Wed Oct 10 16:41:50 2018] epoch_id: 5, batch_id: 3000, cost: 0.191953, acc: 0.929688
-
-[Wed Oct 10 16:41:51 2018] epoch_id: 5, train_avg_cost: 0.122534, train_avg_acc: 0.952647
-[Wed Oct 10 16:41:52 2018] epoch_id: 5, dev_cost: 0.517809, accuracy: 0.8338
-[Wed Oct 10 16:41:53 2018] epoch_id: 5, test_cost: 0.516574, accuracy: 0.8379
-
-[Wed Oct 10 16:42:02 2018] epoch_id: 6, batch_id: 0, cost: 0.108672, acc: 0.953125
-[Wed Oct 10 16:42:04 2018] epoch_id: 6, batch_id: 100, cost: 0.055064, acc: 0.984375
-[Wed Oct 10 16:42:07 2018] epoch_id: 6, batch_id: 200, cost: 0.070521, acc: 0.976562
-[Wed Oct 10 16:42:09 2018] epoch_id: 6, batch_id: 300, cost: 0.044554, acc: 0.992188
-[Wed Oct 10 16:42:12 2018] epoch_id: 6, batch_id: 400, cost: 0.140199, acc: 0.968750
-[Wed Oct 10 16:42:14 2018] epoch_id: 6, batch_id: 500, cost: 0.074043, acc: 0.984375
-[Wed Oct 10 16:42:17 2018] epoch_id: 6, batch_id: 600, cost: 0.072380, acc: 0.960938
-[Wed Oct 10 16:42:19 2018] epoch_id: 6, batch_id: 700, cost: 0.089520, acc: 0.968750
-[Wed Oct 10 16:42:21 2018] epoch_id: 6, batch_id: 800, cost: 0.154753, acc: 0.937500
-[Wed Oct 10 16:42:24 2018] epoch_id: 6, batch_id: 900, cost: 0.137237, acc: 0.945312
-[Wed Oct 10 16:42:26 2018] epoch_id: 6, batch_id: 1000, cost: 0.155418, acc: 0.953125
-[Wed Oct 10 16:42:28 2018] epoch_id: 6, batch_id: 1100, cost: 0.102754, acc: 0.968750
-[Wed Oct 10 16:42:31 2018] epoch_id: 6, batch_id: 1200, cost: 0.171521, acc: 0.929688
-[Wed Oct 10 16:42:33 2018] epoch_id: 6, batch_id: 1300, cost: 0.089853, acc: 0.984375
-[Wed Oct 10 16:42:36 2018] epoch_id: 6, batch_id: 1400, cost: 0.117480, acc: 0.953125
-[Wed Oct 10 16:42:38 2018] epoch_id: 6, batch_id: 1500, cost: 0.144428, acc: 0.953125
-[Wed Oct 10 16:42:40 2018] epoch_id: 6, batch_id: 1600, cost: 0.100815, acc: 0.945312
-[Wed Oct 10 16:42:43 2018] epoch_id: 6, batch_id: 1700, cost: 0.096131, acc: 0.960938
-[Wed Oct 10 16:42:45 2018] epoch_id: 6, batch_id: 1800, cost: 0.083034, acc: 0.968750
-[Wed Oct 10 16:42:47 2018] epoch_id: 6, batch_id: 1900, cost: 0.144603, acc: 0.937500
-[Wed Oct 10 16:42:50 2018] epoch_id: 6, batch_id: 2000, cost: 0.125068, acc: 0.960938
-[Wed Oct 10 16:42:52 2018] epoch_id: 6, batch_id: 2100, cost: 0.096932, acc: 0.945312
-[Wed Oct 10 16:42:54 2018] epoch_id: 6, batch_id: 2200, cost: 0.187626, acc: 0.906250
-[Wed Oct 10 16:42:58 2018] epoch_id: 6, batch_id: 2300, cost: 0.086040, acc: 0.953125
-[Wed Oct 10 16:43:00 2018] epoch_id: 6, batch_id: 2400, cost: 0.112231, acc: 0.960938
-[Wed Oct 10 16:43:03 2018] epoch_id: 6, batch_id: 2500, cost: 0.086397, acc: 0.976562
-[Wed Oct 10 16:43:05 2018] epoch_id: 6, batch_id: 2600, cost: 0.093871, acc: 0.960938
-[Wed Oct 10 16:43:07 2018] epoch_id: 6, batch_id: 2700, cost: 0.143658, acc: 0.953125
-[Wed Oct 10 16:43:10 2018] epoch_id: 6, batch_id: 2800, cost: 0.144744, acc: 0.945312
-[Wed Oct 10 16:43:12 2018] epoch_id: 6, batch_id: 2900, cost: 0.127995, acc: 0.945312
-[Wed Oct 10 16:43:14 2018] epoch_id: 6, batch_id: 3000, cost: 0.201635, acc: 0.929688
-
-[Wed Oct 10 16:43:15 2018] epoch_id: 6, train_avg_cost: 0.100383, train_avg_acc: 0.961683
-[Wed Oct 10 16:43:16 2018] epoch_id: 6, dev_cost: 0.622004, accuracy: 0.833
-[Wed Oct 10 16:43:17 2018] epoch_id: 6, test_cost: 0.604546, accuracy: 0.836
-
-[Wed Oct 10 16:43:25 2018] epoch_id: 7, batch_id: 0, cost: 0.092909, acc: 0.968750
-[Wed Oct 10 16:43:28 2018] epoch_id: 7, batch_id: 100, cost: 0.048849, acc: 0.976562
-[Wed Oct 10 16:43:31 2018] epoch_id: 7, batch_id: 200, cost: 0.123149, acc: 0.960938
-[Wed Oct 10 16:43:33 2018] epoch_id: 7, batch_id: 300, cost: 0.043434, acc: 0.992188
-[Wed Oct 10 16:43:35 2018] epoch_id: 7, batch_id: 400, cost: 0.057082, acc: 0.976562
-[Wed Oct 10 16:43:38 2018] epoch_id: 7, batch_id: 500, cost: 0.043290, acc: 0.976562
-[Wed Oct 10 16:43:40 2018] epoch_id: 7, batch_id: 600, cost: 0.061600, acc: 0.976562
-[Wed Oct 10 16:43:42 2018] epoch_id: 7, batch_id: 700, cost: 0.077328, acc: 0.968750
-[Wed Oct 10 16:43:45 2018] epoch_id: 7, batch_id: 800, cost: 0.139978, acc: 0.953125
-[Wed Oct 10 16:43:48 2018] epoch_id: 7, batch_id: 900, cost: 0.099730, acc: 0.960938
-[Wed Oct 10 16:43:51 2018] epoch_id: 7, batch_id: 1000, cost: 0.072699, acc: 0.976562
-[Wed Oct 10 16:43:53 2018] epoch_id: 7, batch_id: 1100, cost: 0.031092, acc: 0.992188
-[Wed Oct 10 16:43:55 2018] epoch_id: 7, batch_id: 1200, cost: 0.118547, acc: 0.960938
-[Wed Oct 10 16:43:57 2018] epoch_id: 7, batch_id: 1300, cost: 0.061420, acc: 0.976562
-[Wed Oct 10 16:44:00 2018] epoch_id: 7, batch_id: 1400, cost: 0.096040, acc: 0.968750
-[Wed Oct 10 16:44:02 2018] epoch_id: 7, batch_id: 1500, cost: 0.052711, acc: 0.992188
-[Wed Oct 10 16:44:04 2018] epoch_id: 7, batch_id: 1600, cost: 0.150460, acc: 0.929688
-[Wed Oct 10 16:44:07 2018] epoch_id: 7, batch_id: 1700, cost: 0.097628, acc: 0.976562
-[Wed Oct 10 16:44:09 2018] epoch_id: 7, batch_id: 1800, cost: 0.081382, acc: 0.976562
-[Wed Oct 10 16:44:11 2018] epoch_id: 7, batch_id: 1900, cost: 0.089064, acc: 0.953125
-[Wed Oct 10 16:44:14 2018] epoch_id: 7, batch_id: 2000, cost: 0.084270, acc: 0.968750
-[Wed Oct 10 16:44:16 2018] epoch_id: 7, batch_id: 2100, cost: 0.097173, acc: 0.968750
-[Wed Oct 10 16:44:18 2018] epoch_id: 7, batch_id: 2200, cost: 0.112953, acc: 0.960938
-[Wed Oct 10 16:44:20 2018] epoch_id: 7, batch_id: 2300, cost: 0.116143, acc: 0.953125
-[Wed Oct 10 16:44:23 2018] epoch_id: 7, batch_id: 2400, cost: 0.098675, acc: 0.968750
-[Wed Oct 10 16:44:25 2018] epoch_id: 7, batch_id: 2500, cost: 0.150993, acc: 0.945312
-[Wed Oct 10 16:44:27 2018] epoch_id: 7, batch_id: 2600, cost: 0.076421, acc: 0.968750
-[Wed Oct 10 16:44:29 2018] epoch_id: 7, batch_id: 2700, cost: 0.088665, acc: 0.968750
-[Wed Oct 10 16:44:32 2018] epoch_id: 7, batch_id: 2800, cost: 0.142891, acc: 0.937500
-[Wed Oct 10 16:44:34 2018] epoch_id: 7, batch_id: 2900, cost: 0.088820, acc: 0.968750
-[Wed Oct 10 16:44:36 2018] epoch_id: 7, batch_id: 3000, cost: 0.100579, acc: 0.968750
-
-[Wed Oct 10 16:44:37 2018] epoch_id: 7, train_avg_cost: 0.084162, train_avg_acc: 0.968487
-[Wed Oct 10 16:44:38 2018] epoch_id: 7, dev_cost: 0.655423, accuracy: 0.8369
-[Wed Oct 10 16:44:39 2018] epoch_id: 7, test_cost: 0.663061, accuracy: 0.8352
-
-[Wed Oct 10 16:44:47 2018] epoch_id: 8, batch_id: 0, cost: 0.037309, acc: 0.992188
-[Wed Oct 10 16:44:50 2018] epoch_id: 8, batch_id: 100, cost: 0.043888, acc: 0.976562
-[Wed Oct 10 16:44:52 2018] epoch_id: 8, batch_id: 200, cost: 0.099702, acc: 0.960938
-[Wed Oct 10 16:44:54 2018] epoch_id: 8, batch_id: 300, cost: 0.080207, acc: 0.976562
-[Wed Oct 10 16:44:56 2018] epoch_id: 8, batch_id: 400, cost: 0.049319, acc: 0.976562
-[Wed Oct 10 16:44:59 2018] epoch_id: 8, batch_id: 500, cost: 0.041202, acc: 0.976562
-[Wed Oct 10 16:45:01 2018] epoch_id: 8, batch_id: 600, cost: 0.061663, acc: 0.968750
-[Wed Oct 10 16:45:03 2018] epoch_id: 8, batch_id: 700, cost: 0.065126, acc: 0.984375
-[Wed Oct 10 16:45:05 2018] epoch_id: 8, batch_id: 800, cost: 0.057770, acc: 0.976562
-[Wed Oct 10 16:45:07 2018] epoch_id: 8, batch_id: 900, cost: 0.136513, acc: 0.929688
-[Wed Oct 10 16:45:10 2018] epoch_id: 8, batch_id: 1000, cost: 0.054884, acc: 0.968750
-[Wed Oct 10 16:45:12 2018] epoch_id: 8, batch_id: 1100, cost: 0.046854, acc: 0.992188
-[Wed Oct 10 16:45:14 2018] epoch_id: 8, batch_id: 1200, cost: 0.031739, acc: 1.000000
-[Wed Oct 10 16:45:17 2018] epoch_id: 8, batch_id: 1300, cost: 0.127405, acc: 0.953125
-[Wed Oct 10 16:45:19 2018] epoch_id: 8, batch_id: 1400, cost: 0.052842, acc: 0.976562
-[Wed Oct 10 16:45:21 2018] epoch_id: 8, batch_id: 1500, cost: 0.117588, acc: 0.960938
-[Wed Oct 10 16:45:23 2018] epoch_id: 8, batch_id: 1600, cost: 0.078688, acc: 0.968750
-[Wed Oct 10 16:45:26 2018] epoch_id: 8, batch_id: 1700, cost: 0.069420, acc: 0.976562
-[Wed Oct 10 16:45:28 2018] epoch_id: 8, batch_id: 1800, cost: 0.055502, acc: 0.976562
-[Wed Oct 10 16:45:31 2018] epoch_id: 8, batch_id: 1900, cost: 0.161759, acc: 0.945312
-[Wed Oct 10 16:45:34 2018] epoch_id: 8, batch_id: 2000, cost: 0.063610, acc: 0.984375
-[Wed Oct 10 16:45:36 2018] epoch_id: 8, batch_id: 2100, cost: 0.103227, acc: 0.937500
-[Wed Oct 10 16:45:38 2018] epoch_id: 8, batch_id: 2200, cost: 0.065949, acc: 0.976562
-[Wed Oct 10 16:45:40 2018] epoch_id: 8, batch_id: 2300, cost: 0.060299, acc: 0.968750
-[Wed Oct 10 16:45:43 2018] epoch_id: 8, batch_id: 2400, cost: 0.089557, acc: 0.976562
-[Wed Oct 10 16:45:45 2018] epoch_id: 8, batch_id: 2500, cost: 0.095753, acc: 0.968750
-[Wed Oct 10 16:45:47 2018] epoch_id: 8, batch_id: 2600, cost: 0.111113, acc: 0.968750
-[Wed Oct 10 16:45:49 2018] epoch_id: 8, batch_id: 2700, cost: 0.074921, acc: 0.960938
-[Wed Oct 10 16:45:52 2018] epoch_id: 8, batch_id: 2800, cost: 0.105058, acc: 0.945312
-[Wed Oct 10 16:45:54 2018] epoch_id: 8, batch_id: 2900, cost: 0.173304, acc: 0.921875
-[Wed Oct 10 16:45:56 2018] epoch_id: 8, batch_id: 3000, cost: 0.077586, acc: 0.984375
-
-[Wed Oct 10 16:45:56 2018] epoch_id: 8, train_avg_cost: 0.072280, train_avg_acc: 0.973521
-[Wed Oct 10 16:45:57 2018] epoch_id: 8, dev_cost: 0.629243, accuracy: 0.8373
-[Wed Oct 10 16:45:58 2018] epoch_id: 8, test_cost: 0.661630, accuracy: 0.8352
-
-[Wed Oct 10 16:46:07 2018] epoch_id: 9, batch_id: 0, cost: 0.044024, acc: 0.984375
-[Wed Oct 10 16:46:09 2018] epoch_id: 9, batch_id: 100, cost: 0.033798, acc: 0.992188
-[Wed Oct 10 16:46:11 2018] epoch_id: 9, batch_id: 200, cost: 0.077856, acc: 0.976562
-[Wed Oct 10 16:46:14 2018] epoch_id: 9, batch_id: 300, cost: 0.119995, acc: 0.953125
-[Wed Oct 10 16:46:16 2018] epoch_id: 9, batch_id: 400, cost: 0.006741, acc: 1.000000
-[Wed Oct 10 16:46:18 2018] epoch_id: 9, batch_id: 500, cost: 0.097501, acc: 0.953125
-[Wed Oct 10 16:46:20 2018] epoch_id: 9, batch_id: 600, cost: 0.097540, acc: 0.960938
-[Wed Oct 10 16:46:22 2018] epoch_id: 9, batch_id: 700, cost: 0.085677, acc: 0.976562
-[Wed Oct 10 16:46:25 2018] epoch_id: 9, batch_id: 800, cost: 0.131135, acc: 0.960938
-[Wed Oct 10 16:46:27 2018] epoch_id: 9, batch_id: 900, cost: 0.058706, acc: 0.960938
-[Wed Oct 10 16:46:29 2018] epoch_id: 9, batch_id: 1000, cost: 0.081857, acc: 0.968750
-[Wed Oct 10 16:46:31 2018] epoch_id: 9, batch_id: 1100, cost: 0.035656, acc: 0.992188
-[Wed Oct 10 16:46:34 2018] epoch_id: 9, batch_id: 1200, cost: 0.023980, acc: 0.992188
-[Wed Oct 10 16:46:36 2018] epoch_id: 9, batch_id: 1300, cost: 0.104535, acc: 0.976562
-[Wed Oct 10 16:46:38 2018] epoch_id: 9, batch_id: 1400, cost: 0.052738, acc: 0.960938
-[Wed Oct 10 16:46:40 2018] epoch_id: 9, batch_id: 1500, cost: 0.049284, acc: 0.984375
-[Wed Oct 10 16:46:43 2018] epoch_id: 9, batch_id: 1600, cost: 0.040960, acc: 0.976562
-[Wed Oct 10 16:46:45 2018] epoch_id: 9, batch_id: 1700, cost: 0.054090, acc: 0.976562
-[Wed Oct 10 16:46:47 2018] epoch_id: 9, batch_id: 1800, cost: 0.030307, acc: 0.992188
-[Wed Oct 10 16:46:49 2018] epoch_id: 9, batch_id: 1900, cost: 0.152908, acc: 0.960938
-[Wed Oct 10 16:46:52 2018] epoch_id: 9, batch_id: 2000, cost: 0.133532, acc: 0.945312
-[Wed Oct 10 16:46:54 2018] epoch_id: 9, batch_id: 2100, cost: 0.162579, acc: 0.929688
-[Wed Oct 10 16:46:56 2018] epoch_id: 9, batch_id: 2200, cost: 0.037171, acc: 0.984375
-[Wed Oct 10 16:46:58 2018] epoch_id: 9, batch_id: 2300, cost: 0.036093, acc: 0.992188
-[Wed Oct 10 16:47:00 2018] epoch_id: 9, batch_id: 2400, cost: 0.066371, acc: 0.976562
-[Wed Oct 10 16:47:02 2018] epoch_id: 9, batch_id: 2500, cost: 0.047459, acc: 0.984375
-[Wed Oct 10 16:47:04 2018] epoch_id: 9, batch_id: 2600, cost: 0.031411, acc: 0.992188
-[Wed Oct 10 16:47:06 2018] epoch_id: 9, batch_id: 2700, cost: 0.107300, acc: 0.953125
-[Wed Oct 10 16:47:09 2018] epoch_id: 9, batch_id: 2800, cost: 0.041434, acc: 0.984375
-[Wed Oct 10 16:47:11 2018] epoch_id: 9, batch_id: 2900, cost: 0.081185, acc: 0.960938
-[Wed Oct 10 16:47:13 2018] epoch_id: 9, batch_id: 3000, cost: 0.096274, acc: 0.960938
-
-[Wed Oct 10 16:47:13 2018] epoch_id: 9, train_avg_cost: 0.063124, train_avg_acc: 0.976961
-[Wed Oct 10 16:47:14 2018] epoch_id: 9, dev_cost: 0.678009, accuracy: 0.8403
-[Wed Oct 10 16:47:15 2018] epoch_id: 9, test_cost: 0.707977, accuracy: 0.8359
-
-[Wed Oct 10 16:47:24 2018] epoch_id: 10, batch_id: 0, cost: 0.053481, acc: 0.968750
-[Wed Oct 10 16:47:26 2018] epoch_id: 10, batch_id: 100, cost: 0.024990, acc: 0.984375
-[Wed Oct 10 16:47:29 2018] epoch_id: 10, batch_id: 200, cost: 0.025989, acc: 0.992188
-[Wed Oct 10 16:47:31 2018] epoch_id: 10, batch_id: 300, cost: 0.016467, acc: 0.992188
-[Wed Oct 10 16:47:33 2018] epoch_id: 10, batch_id: 400, cost: 0.013582, acc: 1.000000
-[Wed Oct 10 16:47:35 2018] epoch_id: 10, batch_id: 500, cost: 0.062821, acc: 0.984375
-[Wed Oct 10 16:47:38 2018] epoch_id: 10, batch_id: 600, cost: 0.018919, acc: 0.992188
-[Wed Oct 10 16:47:40 2018] epoch_id: 10, batch_id: 700, cost: 0.113543, acc: 0.937500
-[Wed Oct 10 16:47:43 2018] epoch_id: 10, batch_id: 800, cost: 0.042273, acc: 0.984375
-[Wed Oct 10 16:47:45 2018] epoch_id: 10, batch_id: 900, cost: 0.040787, acc: 0.976562
-[Wed Oct 10 16:47:47 2018] epoch_id: 10, batch_id: 1000, cost: 0.013215, acc: 1.000000
-[Wed Oct 10 16:47:50 2018] epoch_id: 10, batch_id: 1100, cost: 0.056862, acc: 0.984375
-[Wed Oct 10 16:47:52 2018] epoch_id: 10, batch_id: 1200, cost: 0.114343, acc: 0.960938
-[Wed Oct 10 16:47:54 2018] epoch_id: 10, batch_id: 1300, cost: 0.068139, acc: 0.968750
-[Wed Oct 10 16:47:57 2018] epoch_id: 10, batch_id: 1400, cost: 0.036262, acc: 0.984375
-[Wed Oct 10 16:47:59 2018] epoch_id: 10, batch_id: 1500, cost: 0.031832, acc: 0.984375
-[Wed Oct 10 16:48:01 2018] epoch_id: 10, batch_id: 1600, cost: 0.098699, acc: 0.953125
-[Wed Oct 10 16:48:03 2018] epoch_id: 10, batch_id: 1700, cost: 0.073122, acc: 0.976562
-[Wed Oct 10 16:48:06 2018] epoch_id: 10, batch_id: 1800, cost: 0.035890, acc: 0.984375
-[Wed Oct 10 16:48:08 2018] epoch_id: 10, batch_id: 1900, cost: 0.036370, acc: 0.968750
-[Wed Oct 10 16:48:10 2018] epoch_id: 10, batch_id: 2000, cost: 0.073071, acc: 0.976562
-[Wed Oct 10 16:48:12 2018] epoch_id: 10, batch_id: 2100, cost: 0.017344, acc: 1.000000
-[Wed Oct 10 16:48:15 2018] epoch_id: 10, batch_id: 2200, cost: 0.146855, acc: 0.953125
-[Wed Oct 10 16:48:17 2018] epoch_id: 10, batch_id: 2300, cost: 0.068342, acc: 0.968750
-[Wed Oct 10 16:48:19 2018] epoch_id: 10, batch_id: 2400, cost: 0.026733, acc: 0.992188
-[Wed Oct 10 16:48:21 2018] epoch_id: 10, batch_id: 2500, cost: 0.085184, acc: 0.976562
-[Wed Oct 10 16:48:23 2018] epoch_id: 10, batch_id: 2600, cost: 0.065530, acc: 0.984375
-[Wed Oct 10 16:48:26 2018] epoch_id: 10, batch_id: 2700, cost: 0.111871, acc: 0.968750
-[Wed Oct 10 16:48:29 2018] epoch_id: 10, batch_id: 2800, cost: 0.063721, acc: 0.968750
-[Wed Oct 10 16:48:31 2018] epoch_id: 10, batch_id: 2900, cost: 0.026759, acc: 0.992188
-[Wed Oct 10 16:48:34 2018] epoch_id: 10, batch_id: 3000, cost: 0.031338, acc: 0.992188
-
-[Wed Oct 10 16:48:34 2018] epoch_id: 10, train_avg_cost: 0.055555, train_avg_acc: 0.979852
-[Wed Oct 10 16:48:35 2018] epoch_id: 10, dev_cost: 0.782007, accuracy: 0.8366
-[Wed Oct 10 16:48:36 2018] epoch_id: 10, test_cost: 0.795087, accuracy: 0.8369
-
-[Wed Oct 10 16:48:44 2018] epoch_id: 11, batch_id: 0, cost: 0.032797, acc: 0.992188
-[Wed Oct 10 16:48:47 2018] epoch_id: 11, batch_id: 100, cost: 0.011773, acc: 0.992188
-[Wed Oct 10 16:48:49 2018] epoch_id: 11, batch_id: 200, cost: 0.012297, acc: 1.000000
-[Wed Oct 10 16:48:51 2018] epoch_id: 11, batch_id: 300, cost: 0.032454, acc: 0.992188
-[Wed Oct 10 16:48:53 2018] epoch_id: 11, batch_id: 400, cost: 0.100247, acc: 0.976562
-[Wed Oct 10 16:48:55 2018] epoch_id: 11, batch_id: 500, cost: 0.035470, acc: 0.992188
-[Wed Oct 10 16:48:58 2018] epoch_id: 11, batch_id: 600, cost: 0.032553, acc: 0.984375
-[Wed Oct 10 16:49:00 2018] epoch_id: 11, batch_id: 700, cost: 0.035226, acc: 0.984375
-[Wed Oct 10 16:49:02 2018] epoch_id: 11, batch_id: 800, cost: 0.010961, acc: 1.000000
-[Wed Oct 10 16:49:04 2018] epoch_id: 11, batch_id: 900, cost: 0.033747, acc: 0.984375
-[Wed Oct 10 16:49:07 2018] epoch_id: 11, batch_id: 1000, cost: 0.052710, acc: 0.976562
-[Wed Oct 10 16:49:09 2018] epoch_id: 11, batch_id: 1100, cost: 0.021664, acc: 0.992188
-[Wed Oct 10 16:49:11 2018] epoch_id: 11, batch_id: 1200, cost: 0.056635, acc: 0.984375
-[Wed Oct 10 16:49:14 2018] epoch_id: 11, batch_id: 1300, cost: 0.007764, acc: 1.000000
-[Wed Oct 10 16:49:16 2018] epoch_id: 11, batch_id: 1400, cost: 0.042336, acc: 0.976562
-[Wed Oct 10 16:49:18 2018] epoch_id: 11, batch_id: 1500, cost: 0.077117, acc: 0.976562
-[Wed Oct 10 16:49:20 2018] epoch_id: 11, batch_id: 1600, cost: 0.082522, acc: 0.976562
-[Wed Oct 10 16:49:22 2018] epoch_id: 11, batch_id: 1700, cost: 0.022290, acc: 1.000000
-[Wed Oct 10 16:49:25 2018] epoch_id: 11, batch_id: 1800, cost: 0.033992, acc: 0.984375
-[Wed Oct 10 16:49:27 2018] epoch_id: 11, batch_id: 1900, cost: 0.027460, acc: 0.992188
-[Wed Oct 10 16:49:29 2018] epoch_id: 11, batch_id: 2000, cost: 0.032003, acc: 0.992188
-[Wed Oct 10 16:49:31 2018] epoch_id: 11, batch_id: 2100, cost: 0.070170, acc: 0.976562
-[Wed Oct 10 16:49:33 2018] epoch_id: 11, batch_id: 2200, cost: 0.017124, acc: 0.992188
-[Wed Oct 10 16:49:36 2018] epoch_id: 11, batch_id: 2300, cost: 0.037207, acc: 0.984375
-[Wed Oct 10 16:49:39 2018] epoch_id: 11, batch_id: 2400, cost: 0.018202, acc: 1.000000
-[Wed Oct 10 16:49:41 2018] epoch_id: 11, batch_id: 2500, cost: 0.059570, acc: 0.976562
-[Wed Oct 10 16:49:43 2018] epoch_id: 11, batch_id: 2600, cost: 0.009950, acc: 1.000000
-[Wed Oct 10 16:49:46 2018] epoch_id: 11, batch_id: 2700, cost: 0.015869, acc: 1.000000
-[Wed Oct 10 16:49:48 2018] epoch_id: 11, batch_id: 2800, cost: 0.049429, acc: 0.984375
-[Wed Oct 10 16:49:50 2018] epoch_id: 11, batch_id: 2900, cost: 0.061248, acc: 0.976562
-[Wed Oct 10 16:49:52 2018] epoch_id: 11, batch_id: 3000, cost: 0.007281, acc: 1.000000
-
-[Wed Oct 10 16:49:53 2018] epoch_id: 11, train_avg_cost: 0.049100, train_avg_acc: 0.982414
-[Wed Oct 10 16:49:54 2018] epoch_id: 11, dev_cost: 0.919803, accuracy: 0.8392
-[Wed Oct 10 16:49:55 2018] epoch_id: 11, test_cost: 0.963836, accuracy: 0.8354
-
-[Wed Oct 10 16:50:03 2018] epoch_id: 12, batch_id: 0, cost: 0.021594, acc: 0.992188
-[Wed Oct 10 16:50:05 2018] epoch_id: 12, batch_id: 100, cost: 0.003167, acc: 1.000000
-[Wed Oct 10 16:50:08 2018] epoch_id: 12, batch_id: 200, cost: 0.034331, acc: 0.984375
-[Wed Oct 10 16:50:10 2018] epoch_id: 12, batch_id: 300, cost: 0.044300, acc: 0.984375
-[Wed Oct 10 16:50:12 2018] epoch_id: 12, batch_id: 400, cost: 0.010300, acc: 1.000000
-[Wed Oct 10 16:50:15 2018] epoch_id: 12, batch_id: 500, cost: 0.071121, acc: 0.968750
-[Wed Oct 10 16:50:17 2018] epoch_id: 12, batch_id: 600, cost: 0.027463, acc: 0.984375
-[Wed Oct 10 16:50:19 2018] epoch_id: 12, batch_id: 700, cost: 0.023278, acc: 0.992188
-[Wed Oct 10 16:50:22 2018] epoch_id: 12, batch_id: 800, cost: 0.024731, acc: 0.992188
-[Wed Oct 10 16:50:25 2018] epoch_id: 12, batch_id: 900, cost: 0.033520, acc: 0.992188
-[Wed Oct 10 16:50:27 2018] epoch_id: 12, batch_id: 1000, cost: 0.066168, acc: 0.984375
-[Wed Oct 10 16:50:29 2018] epoch_id: 12, batch_id: 1100, cost: 0.086032, acc: 0.976562
-[Wed Oct 10 16:50:32 2018] epoch_id: 12, batch_id: 1200, cost: 0.041718, acc: 0.968750
-[Wed Oct 10 16:50:34 2018] epoch_id: 12, batch_id: 1300, cost: 0.085903, acc: 0.968750
-[Wed Oct 10 16:50:36 2018] epoch_id: 12, batch_id: 1400, cost: 0.022963, acc: 0.992188
-[Wed Oct 10 16:50:38 2018] epoch_id: 12, batch_id: 1500, cost: 0.008185, acc: 1.000000
-[Wed Oct 10 16:50:41 2018] epoch_id: 12, batch_id: 1600, cost: 0.057872, acc: 0.968750
-[Wed Oct 10 16:50:43 2018] epoch_id: 12, batch_id: 1700, cost: 0.011306, acc: 1.000000
-[Wed Oct 10 16:50:45 2018] epoch_id: 12, batch_id: 1800, cost: 0.030697, acc: 0.984375
-[Wed Oct 10 16:50:47 2018] epoch_id: 12, batch_id: 1900, cost: 0.049713, acc: 0.984375
-[Wed Oct 10 16:50:50 2018] epoch_id: 12, batch_id: 2000, cost: 0.050341, acc: 0.976562
-[Wed Oct 10 16:50:52 2018] epoch_id: 12, batch_id: 2100, cost: 0.024994, acc: 0.992188
-[Wed Oct 10 16:50:54 2018] epoch_id: 12, batch_id: 2200, cost: 0.046852, acc: 0.968750
-[Wed Oct 10 16:50:56 2018] epoch_id: 12, batch_id: 2300, cost: 0.055520, acc: 0.976562
-[Wed Oct 10 16:50:59 2018] epoch_id: 12, batch_id: 2400, cost: 0.085991, acc: 0.968750
-[Wed Oct 10 16:51:01 2018] epoch_id: 12, batch_id: 2500, cost: 0.044263, acc: 0.984375
-[Wed Oct 10 16:51:03 2018] epoch_id: 12, batch_id: 2600, cost: 0.071548, acc: 0.976562
-[Wed Oct 10 16:51:05 2018] epoch_id: 12, batch_id: 2700, cost: 0.039594, acc: 0.976562
-[Wed Oct 10 16:51:08 2018] epoch_id: 12, batch_id: 2800, cost: 0.058939, acc: 0.984375
-[Wed Oct 10 16:51:10 2018] epoch_id: 12, batch_id: 2900, cost: 0.070956, acc: 0.968750
-[Wed Oct 10 16:51:12 2018] epoch_id: 12, batch_id: 3000, cost: 0.059941, acc: 0.960938
-
-[Wed Oct 10 16:51:13 2018] epoch_id: 12, train_avg_cost: 0.044984, train_avg_acc: 0.983741
-[Wed Oct 10 16:51:14 2018] epoch_id: 12, dev_cost: 0.742705, accuracy: 0.8364
-[Wed Oct 10 16:51:14 2018] epoch_id: 12, test_cost: 0.765290, accuracy: 0.8355
-
-[Wed Oct 10 16:51:23 2018] epoch_id: 13, batch_id: 0, cost: 0.054822, acc: 0.968750
-[Wed Oct 10 16:51:25 2018] epoch_id: 13, batch_id: 100, cost: 0.066483, acc: 0.976562
-[Wed Oct 10 16:51:28 2018] epoch_id: 13, batch_id: 200, cost: 0.007064, acc: 1.000000
-[Wed Oct 10 16:51:30 2018] epoch_id: 13, batch_id: 300, cost: 0.050190, acc: 0.984375
-[Wed Oct 10 16:51:32 2018] epoch_id: 13, batch_id: 400, cost: 0.044636, acc: 0.984375
-[Wed Oct 10 16:51:34 2018] epoch_id: 13, batch_id: 500, cost: 0.040963, acc: 0.984375
-[Wed Oct 10 16:51:37 2018] epoch_id: 13, batch_id: 600, cost: 0.029529, acc: 0.992188
-[Wed Oct 10 16:51:39 2018] epoch_id: 13, batch_id: 700, cost: 0.011587, acc: 1.000000
-[Wed Oct 10 16:51:41 2018] epoch_id: 13, batch_id: 800, cost: 0.039673, acc: 0.984375
-[Wed Oct 10 16:51:43 2018] epoch_id: 13, batch_id: 900, cost: 0.028793, acc: 0.984375
-[Wed Oct 10 16:51:46 2018] epoch_id: 13, batch_id: 1000, cost: 0.055973, acc: 0.968750
-[Wed Oct 10 16:51:48 2018] epoch_id: 13, batch_id: 1100, cost: 0.016087, acc: 0.992188
-[Wed Oct 10 16:51:50 2018] epoch_id: 13, batch_id: 1200, cost: 0.096423, acc: 0.960938
-[Wed Oct 10 16:51:52 2018] epoch_id: 13, batch_id: 1300, cost: 0.019652, acc: 0.992188
-[Wed Oct 10 16:51:55 2018] epoch_id: 13, batch_id: 1400, cost: 0.018604, acc: 0.992188
-[Wed Oct 10 16:51:57 2018] epoch_id: 13, batch_id: 1500, cost: 0.060169, acc: 0.960938
-[Wed Oct 10 16:51:59 2018] epoch_id: 13, batch_id: 1600, cost: 0.014124, acc: 0.992188
-[Wed Oct 10 16:52:01 2018] epoch_id: 13, batch_id: 1700, cost: 0.029843, acc: 0.984375
-[Wed Oct 10 16:52:05 2018] epoch_id: 13, batch_id: 1800, cost: 0.063125, acc: 0.976562
-[Wed Oct 10 16:52:07 2018] epoch_id: 13, batch_id: 1900, cost: 0.070910, acc: 0.953125
-[Wed Oct 10 16:52:09 2018] epoch_id: 13, batch_id: 2000, cost: 0.042864, acc: 0.984375
-[Wed Oct 10 16:52:11 2018] epoch_id: 13, batch_id: 2100, cost: 0.014658, acc: 0.992188
-[Wed Oct 10 16:52:14 2018] epoch_id: 13, batch_id: 2200, cost: 0.075003, acc: 0.968750
-[Wed Oct 10 16:52:16 2018] epoch_id: 13, batch_id: 2300, cost: 0.034856, acc: 0.976562
-[Wed Oct 10 16:52:18 2018] epoch_id: 13, batch_id: 2400, cost: 0.040518, acc: 0.976562
-[Wed Oct 10 16:52:20 2018] epoch_id: 13, batch_id: 2500, cost: 0.040826, acc: 0.976562
-[Wed Oct 10 16:52:23 2018] epoch_id: 13, batch_id: 2600, cost: 0.043420, acc: 0.968750
-[Wed Oct 10 16:52:25 2018] epoch_id: 13, batch_id: 2700, cost: 0.027364, acc: 0.984375
-[Wed Oct 10 16:52:27 2018] epoch_id: 13, batch_id: 2800, cost: 0.030051, acc: 0.984375
-[Wed Oct 10 16:52:30 2018] epoch_id: 13, batch_id: 2900, cost: 0.040024, acc: 0.984375
-[Wed Oct 10 16:52:32 2018] epoch_id: 13, batch_id: 3000, cost: 0.054583, acc: 0.968750
-
-[Wed Oct 10 16:52:32 2018] epoch_id: 13, train_avg_cost: 0.041237, train_avg_acc: 0.985349
-[Wed Oct 10 16:52:33 2018] epoch_id: 13, dev_cost: 1.078762, accuracy: 0.8411
-[Wed Oct 10 16:52:34 2018] epoch_id: 13, test_cost: 1.111191, accuracy: 0.8358
-
-[Wed Oct 10 16:52:43 2018] epoch_id: 14, batch_id: 0, cost: 0.003011, acc: 1.000000
-[Wed Oct 10 16:52:45 2018] epoch_id: 14, batch_id: 100, cost: 0.006236, acc: 1.000000
-[Wed Oct 10 16:52:48 2018] epoch_id: 14, batch_id: 200, cost: 0.017501, acc: 0.992188
-[Wed Oct 10 16:52:50 2018] epoch_id: 14, batch_id: 300, cost: 0.062686, acc: 0.976562
-[Wed Oct 10 16:52:52 2018] epoch_id: 14, batch_id: 400, cost: 0.008696, acc: 1.000000
-[Wed Oct 10 16:52:54 2018] epoch_id: 14, batch_id: 500, cost: 0.033238, acc: 0.984375
-[Wed Oct 10 16:52:57 2018] epoch_id: 14, batch_id: 600, cost: 0.086478, acc: 0.976562
-[Wed Oct 10 16:52:59 2018] epoch_id: 14, batch_id: 700, cost: 0.009820, acc: 0.992188
-[Wed Oct 10 16:53:01 2018] epoch_id: 14, batch_id: 800, cost: 0.066287, acc: 0.992188
-[Wed Oct 10 16:53:03 2018] epoch_id: 14, batch_id: 900, cost: 0.004043, acc: 1.000000
-[Wed Oct 10 16:53:05 2018] epoch_id: 14, batch_id: 1000, cost: 0.007859, acc: 1.000000
-[Wed Oct 10 16:53:08 2018] epoch_id: 14, batch_id: 1100, cost: 0.040856, acc: 0.976562
-[Wed Oct 10 16:53:10 2018] epoch_id: 14, batch_id: 1200, cost: 0.038995, acc: 0.984375
-[Wed Oct 10 16:53:12 2018] epoch_id: 14, batch_id: 1300, cost: 0.026738, acc: 0.992188
-[Wed Oct 10 16:53:14 2018] epoch_id: 14, batch_id: 1400, cost: 0.048141, acc: 0.968750
-[Wed Oct 10 16:53:16 2018] epoch_id: 14, batch_id: 1500, cost: 0.081051, acc: 0.976562
-[Wed Oct 10 16:53:19 2018] epoch_id: 14, batch_id: 1600, cost: 0.017602, acc: 0.992188
-[Wed Oct 10 16:53:21 2018] epoch_id: 14, batch_id: 1700, cost: 0.018175, acc: 0.992188
-[Wed Oct 10 16:53:23 2018] epoch_id: 14, batch_id: 1800, cost: 0.076890, acc: 0.968750
-[Wed Oct 10 16:53:25 2018] epoch_id: 14, batch_id: 1900, cost: 0.060768, acc: 0.976562
-[Wed Oct 10 16:53:28 2018] epoch_id: 14, batch_id: 2000, cost: 0.020131, acc: 0.984375
-[Wed Oct 10 16:53:30 2018] epoch_id: 14, batch_id: 2100, cost: 0.077612, acc: 0.976562
-[Wed Oct 10 16:53:32 2018] epoch_id: 14, batch_id: 2200, cost: 0.101997, acc: 0.960938
-[Wed Oct 10 16:53:34 2018] epoch_id: 14, batch_id: 2300, cost: 0.061213, acc: 0.976562
-[Wed Oct 10 16:53:37 2018] epoch_id: 14, batch_id: 2400, cost: 0.048987, acc: 0.976562
-[Wed Oct 10 16:53:39 2018] epoch_id: 14, batch_id: 2500, cost: 0.037741, acc: 0.984375
-[Wed Oct 10 16:53:41 2018] epoch_id: 14, batch_id: 2600, cost: 0.011101, acc: 1.000000
-[Wed Oct 10 16:53:43 2018] epoch_id: 14, batch_id: 2700, cost: 0.019846, acc: 0.992188
-[Wed Oct 10 16:53:45 2018] epoch_id: 14, batch_id: 2800, cost: 0.026633, acc: 1.000000
-[Wed Oct 10 16:53:48 2018] epoch_id: 14, batch_id: 2900, cost: 0.048637, acc: 0.976562
-[Wed Oct 10 16:53:50 2018] epoch_id: 14, batch_id: 3000, cost: 0.056658, acc: 0.992188
-
-[Wed Oct 10 16:53:50 2018] epoch_id: 14, train_avg_cost: 0.037520, train_avg_acc: 0.986595
-[Wed Oct 10 16:53:51 2018] epoch_id: 14, dev_cost: 0.958707, accuracy: 0.8367
-[Wed Oct 10 16:53:52 2018] epoch_id: 14, test_cost: 0.974553, accuracy: 0.8382
-
-[Wed Oct 10 16:54:01 2018] epoch_id: 15, batch_id: 0, cost: 0.015232, acc: 1.000000
-[Wed Oct 10 16:54:04 2018] epoch_id: 15, batch_id: 100, cost: 0.007195, acc: 1.000000
-[Wed Oct 10 16:54:06 2018] epoch_id: 15, batch_id: 200, cost: 0.017140, acc: 0.992188
-[Wed Oct 10 16:54:08 2018] epoch_id: 15, batch_id: 300, cost: 0.003196, acc: 1.000000
-[Wed Oct 10 16:54:10 2018] epoch_id: 15, batch_id: 400, cost: 0.046839, acc: 0.976562
-[Wed Oct 10 16:54:13 2018] epoch_id: 15, batch_id: 500, cost: 0.038533, acc: 0.992188
-[Wed Oct 10 16:54:15 2018] epoch_id: 15, batch_id: 600, cost: 0.016502, acc: 0.992188
-[Wed Oct 10 16:54:17 2018] epoch_id: 15, batch_id: 700, cost: 0.041825, acc: 0.976562
-[Wed Oct 10 16:54:20 2018] epoch_id: 15, batch_id: 800, cost: 0.083583, acc: 0.968750
-[Wed Oct 10 16:54:22 2018] epoch_id: 15, batch_id: 900, cost: 0.013552, acc: 0.992188
-[Wed Oct 10 16:54:24 2018] epoch_id: 15, batch_id: 1000, cost: 0.015114, acc: 1.000000
-[Wed Oct 10 16:54:26 2018] epoch_id: 15, batch_id: 1100, cost: 0.020185, acc: 0.992188
-[Wed Oct 10 16:54:29 2018] epoch_id: 15, batch_id: 1200, cost: 0.023274, acc: 0.984375
-[Wed Oct 10 16:54:31 2018] epoch_id: 15, batch_id: 1300, cost: 0.013836, acc: 1.000000
-[Wed Oct 10 16:54:33 2018] epoch_id: 15, batch_id: 1400, cost: 0.091024, acc: 0.984375
-[Wed Oct 10 16:54:36 2018] epoch_id: 15, batch_id: 1500, cost: 0.047340, acc: 0.976562
-[Wed Oct 10 16:54:38 2018] epoch_id: 15, batch_id: 1600, cost: 0.030423, acc: 0.992188
-[Wed Oct 10 16:54:40 2018] epoch_id: 15, batch_id: 1700, cost: 0.014750, acc: 0.992188
-[Wed Oct 10 16:54:42 2018] epoch_id: 15, batch_id: 1800, cost: 0.090613, acc: 0.968750
-[Wed Oct 10 16:54:45 2018] epoch_id: 15, batch_id: 1900, cost: 0.030791, acc: 0.984375
-[Wed Oct 10 16:54:47 2018] epoch_id: 15, batch_id: 2000, cost: 0.046719, acc: 0.976562
-[Wed Oct 10 16:54:49 2018] epoch_id: 15, batch_id: 2100, cost: 0.043871, acc: 0.984375
-[Wed Oct 10 16:54:51 2018] epoch_id: 15, batch_id: 2200, cost: 0.078455, acc: 0.968750
-[Wed Oct 10 16:54:53 2018] epoch_id: 15, batch_id: 2300, cost: 0.029536, acc: 0.976562
-[Wed Oct 10 16:54:56 2018] epoch_id: 15, batch_id: 2400, cost: 0.028696, acc: 0.984375
-[Wed Oct 10 16:54:58 2018] epoch_id: 15, batch_id: 2500, cost: 0.007129, acc: 0.992188
-[Wed Oct 10 16:55:00 2018] epoch_id: 15, batch_id: 2600, cost: 0.049990, acc: 0.976562
-[Wed Oct 10 16:55:03 2018] epoch_id: 15, batch_id: 2700, cost: 0.040309, acc: 0.984375
-[Wed Oct 10 16:55:06 2018] epoch_id: 15, batch_id: 2800, cost: 0.098748, acc: 0.976562
-[Wed Oct 10 16:55:08 2018] epoch_id: 15, batch_id: 2900, cost: 0.005371, acc: 1.000000
-[Wed Oct 10 16:55:10 2018] epoch_id: 15, batch_id: 3000, cost: 0.060264, acc: 0.960938
-
-[Wed Oct 10 16:55:11 2018] epoch_id: 15, train_avg_cost: 0.034637, train_avg_acc: 0.987582
-[Wed Oct 10 16:55:12 2018] epoch_id: 15, dev_cost: 0.858216, accuracy: 0.8365
-[Wed Oct 10 16:55:13 2018] epoch_id: 15, test_cost: 0.874420, accuracy: 0.8411
-
-[Wed Oct 10 16:55:21 2018] epoch_id: 16, batch_id: 0, cost: 0.013283, acc: 1.000000
-[Wed Oct 10 16:55:23 2018] epoch_id: 16, batch_id: 100, cost: 0.038128, acc: 0.984375
-[Wed Oct 10 16:55:25 2018] epoch_id: 16, batch_id: 200, cost: 0.031110, acc: 0.976562
-[Wed Oct 10 16:55:28 2018] epoch_id: 16, batch_id: 300, cost: 0.005346, acc: 1.000000
-[Wed Oct 10 16:55:30 2018] epoch_id: 16, batch_id: 400, cost: 0.027634, acc: 0.984375
-[Wed Oct 10 16:55:32 2018] epoch_id: 16, batch_id: 500, cost: 0.065929, acc: 0.976562
-[Wed Oct 10 16:55:35 2018] epoch_id: 16, batch_id: 600, cost: 0.012638, acc: 0.992188
-[Wed Oct 10 16:55:37 2018] epoch_id: 16, batch_id: 700, cost: 0.057962, acc: 0.984375
-[Wed Oct 10 16:55:39 2018] epoch_id: 16, batch_id: 800, cost: 0.064390, acc: 0.976562
-[Wed Oct 10 16:55:42 2018] epoch_id: 16, batch_id: 900, cost: 0.018866, acc: 0.992188
-[Wed Oct 10 16:55:44 2018] epoch_id: 16, batch_id: 1000, cost: 0.004791, acc: 1.000000
-[Wed Oct 10 16:55:46 2018] epoch_id: 16, batch_id: 1100, cost: 0.012691, acc: 0.992188
-[Wed Oct 10 16:55:49 2018] epoch_id: 16, batch_id: 1200, cost: 0.033199, acc: 0.992188
-[Wed Oct 10 16:55:51 2018] epoch_id: 16, batch_id: 1300, cost: 0.007757, acc: 1.000000
-[Wed Oct 10 16:55:53 2018] epoch_id: 16, batch_id: 1400, cost: 0.016653, acc: 0.992188
-[Wed Oct 10 16:55:55 2018] epoch_id: 16, batch_id: 1500, cost: 0.034653, acc: 0.968750
-[Wed Oct 10 16:55:58 2018] epoch_id: 16, batch_id: 1600, cost: 0.051049, acc: 0.976562
-[Wed Oct 10 16:56:00 2018] epoch_id: 16, batch_id: 1700, cost: 0.001466, acc: 1.000000
-[Wed Oct 10 16:56:02 2018] epoch_id: 16, batch_id: 1800, cost: 0.035508, acc: 0.992188
-[Wed Oct 10 16:56:05 2018] epoch_id: 16, batch_id: 1900, cost: 0.022919, acc: 0.984375
-[Wed Oct 10 16:56:07 2018] epoch_id: 16, batch_id: 2000, cost: 0.102175, acc: 0.976562
-[Wed Oct 10 16:56:09 2018] epoch_id: 16, batch_id: 2100, cost: 0.012663, acc: 1.000000
-[Wed Oct 10 16:56:11 2018] epoch_id: 16, batch_id: 2200, cost: 0.026142, acc: 0.984375
-[Wed Oct 10 16:56:15 2018] epoch_id: 16, batch_id: 2300, cost: 0.007566, acc: 1.000000
-[Wed Oct 10 16:56:17 2018] epoch_id: 16, batch_id: 2400, cost: 0.043235, acc: 0.976562
-[Wed Oct 10 16:56:20 2018] epoch_id: 16, batch_id: 2500, cost: 0.039383, acc: 0.984375
-[Wed Oct 10 16:56:22 2018] epoch_id: 16, batch_id: 2600, cost: 0.009917, acc: 1.000000
-[Wed Oct 10 16:56:24 2018] epoch_id: 16, batch_id: 2700, cost: 0.036917, acc: 0.984375
-[Wed Oct 10 16:56:26 2018] epoch_id: 16, batch_id: 2800, cost: 0.012813, acc: 1.000000
-[Wed Oct 10 16:56:29 2018] epoch_id: 16, batch_id: 2900, cost: 0.033933, acc: 0.984375
-[Wed Oct 10 16:56:31 2018] epoch_id: 16, batch_id: 3000, cost: 0.007463, acc: 1.000000
-
-[Wed Oct 10 16:56:32 2018] epoch_id: 16, train_avg_cost: 0.031971, train_avg_acc: 0.988555
-[Wed Oct 10 16:56:33 2018] epoch_id: 16, dev_cost: 0.955907, accuracy: 0.8389
-[Wed Oct 10 16:56:34 2018] epoch_id: 16, test_cost: 0.953062, accuracy: 0.8389
-
-[Wed Oct 10 16:56:42 2018] epoch_id: 17, batch_id: 0, cost: 0.031323, acc: 0.992188
-[Wed Oct 10 16:56:44 2018] epoch_id: 17, batch_id: 100, cost: 0.010965, acc: 1.000000
-[Wed Oct 10 16:56:46 2018] epoch_id: 17, batch_id: 200, cost: 0.056771, acc: 0.976562
-[Wed Oct 10 16:56:49 2018] epoch_id: 17, batch_id: 300, cost: 0.026509, acc: 0.992188
-[Wed Oct 10 16:56:51 2018] epoch_id: 17, batch_id: 400, cost: 0.039409, acc: 0.992188
-[Wed Oct 10 16:56:53 2018] epoch_id: 17, batch_id: 500, cost: 0.063554, acc: 0.976562
-[Wed Oct 10 16:56:55 2018] epoch_id: 17, batch_id: 600, cost: 0.035896, acc: 0.976562
-[Wed Oct 10 16:56:58 2018] epoch_id: 17, batch_id: 700, cost: 0.022053, acc: 0.992188
-[Wed Oct 10 16:57:00 2018] epoch_id: 17, batch_id: 800, cost: 0.024150, acc: 0.976562
-[Wed Oct 10 16:57:03 2018] epoch_id: 17, batch_id: 900, cost: 0.009064, acc: 0.992188
-[Wed Oct 10 16:57:05 2018] epoch_id: 17, batch_id: 1000, cost: 0.037311, acc: 0.976562
-[Wed Oct 10 16:57:08 2018] epoch_id: 17, batch_id: 1100, cost: 0.036577, acc: 0.976562
-[Wed Oct 10 16:57:10 2018] epoch_id: 17, batch_id: 1200, cost: 0.020783, acc: 0.992188
-[Wed Oct 10 16:57:12 2018] epoch_id: 17, batch_id: 1300, cost: 0.017610, acc: 0.992188
-[Wed Oct 10 16:57:14 2018] epoch_id: 17, batch_id: 1400, cost: 0.027604, acc: 0.976562
-[Wed Oct 10 16:57:17 2018] epoch_id: 17, batch_id: 1500, cost: 0.040730, acc: 0.992188
-[Wed Oct 10 16:57:19 2018] epoch_id: 17, batch_id: 1600, cost: 0.077946, acc: 0.984375
-[Wed Oct 10 16:57:21 2018] epoch_id: 17, batch_id: 1700, cost: 0.021349, acc: 0.984375
-[Wed Oct 10 16:57:24 2018] epoch_id: 17, batch_id: 1800, cost: 0.016132, acc: 0.992188
-[Wed Oct 10 16:57:26 2018] epoch_id: 17, batch_id: 1900, cost: 0.018797, acc: 0.984375
-[Wed Oct 10 16:57:28 2018] epoch_id: 17, batch_id: 2000, cost: 0.009052, acc: 1.000000
-[Wed Oct 10 16:57:30 2018] epoch_id: 17, batch_id: 2100, cost: 0.028399, acc: 0.992188
-[Wed Oct 10 16:57:33 2018] epoch_id: 17, batch_id: 2200, cost: 0.009593, acc: 1.000000
-[Wed Oct 10 16:57:35 2018] epoch_id: 17, batch_id: 2300, cost: 0.018474, acc: 0.992188
-[Wed Oct 10 16:57:37 2018] epoch_id: 17, batch_id: 2400, cost: 0.007873, acc: 1.000000
-[Wed Oct 10 16:57:40 2018] epoch_id: 17, batch_id: 2500, cost: 0.054923, acc: 0.976562
-[Wed Oct 10 16:57:42 2018] epoch_id: 17, batch_id: 2600, cost: 0.019036, acc: 0.992188
-[Wed Oct 10 16:57:44 2018] epoch_id: 17, batch_id: 2700, cost: 0.017081, acc: 1.000000
-[Wed Oct 10 16:57:46 2018] epoch_id: 17, batch_id: 2800, cost: 0.045522, acc: 0.976562
-[Wed Oct 10 16:57:49 2018] epoch_id: 17, batch_id: 2900, cost: 0.034922, acc: 0.984375
-[Wed Oct 10 16:57:51 2018] epoch_id: 17, batch_id: 3000, cost: 0.039566, acc: 0.984375
-
-[Wed Oct 10 16:57:51 2018] epoch_id: 17, train_avg_cost: 0.030061, train_avg_acc: 0.989478
-[Wed Oct 10 16:57:52 2018] epoch_id: 17, dev_cost: 1.184997, accuracy: 0.8406
-[Wed Oct 10 16:57:53 2018] epoch_id: 17, test_cost: 1.175792, accuracy: 0.8372
-
-[Wed Oct 10 16:58:02 2018] epoch_id: 18, batch_id: 0, cost: 0.015059, acc: 0.992188
-[Wed Oct 10 16:58:04 2018] epoch_id: 18, batch_id: 100, cost: 0.023421, acc: 0.992188
-[Wed Oct 10 16:58:06 2018] epoch_id: 18, batch_id: 200, cost: 0.007234, acc: 1.000000
-[Wed Oct 10 16:58:08 2018] epoch_id: 18, batch_id: 300, cost: 0.007139, acc: 1.000000
-[Wed Oct 10 16:58:10 2018] epoch_id: 18, batch_id: 400, cost: 0.007934, acc: 1.000000
-[Wed Oct 10 16:58:13 2018] epoch_id: 18, batch_id: 500, cost: 0.004312, acc: 1.000000
-[Wed Oct 10 16:58:15 2018] epoch_id: 18, batch_id: 600, cost: 0.001806, acc: 1.000000
-[Wed Oct 10 16:58:17 2018] epoch_id: 18, batch_id: 700, cost: 0.004790, acc: 1.000000
-[Wed Oct 10 16:58:20 2018] epoch_id: 18, batch_id: 800, cost: 0.048477, acc: 0.992188
-[Wed Oct 10 16:58:22 2018] epoch_id: 18, batch_id: 900, cost: 0.066390, acc: 0.992188
-[Wed Oct 10 16:58:24 2018] epoch_id: 18, batch_id: 1000, cost: 0.014440, acc: 0.992188
-[Wed Oct 10 16:58:26 2018] epoch_id: 18, batch_id: 1100, cost: 0.020435, acc: 0.992188
-[Wed Oct 10 16:58:29 2018] epoch_id: 18, batch_id: 1200, cost: 0.007474, acc: 0.992188
-[Wed Oct 10 16:58:31 2018] epoch_id: 18, batch_id: 1300, cost: 0.036209, acc: 0.984375
-[Wed Oct 10 16:58:33 2018] epoch_id: 18, batch_id: 1400, cost: 0.026540, acc: 0.984375
-[Wed Oct 10 16:58:35 2018] epoch_id: 18, batch_id: 1500, cost: 0.019448, acc: 0.992188
-[Wed Oct 10 16:58:38 2018] epoch_id: 18, batch_id: 1600, cost: 0.052421, acc: 0.968750
-[Wed Oct 10 16:58:40 2018] epoch_id: 18, batch_id: 1700, cost: 0.022365, acc: 0.992188
-[Wed Oct 10 16:58:42 2018] epoch_id: 18, batch_id: 1800, cost: 0.135754, acc: 0.984375
-[Wed Oct 10 16:58:45 2018] epoch_id: 18, batch_id: 1900, cost: 0.037197, acc: 0.992188
-[Wed Oct 10 16:58:48 2018] epoch_id: 18, batch_id: 2000, cost: 0.010672, acc: 0.992188
-[Wed Oct 10 16:58:50 2018] epoch_id: 18, batch_id: 2100, cost: 0.012909, acc: 1.000000
-[Wed Oct 10 16:58:52 2018] epoch_id: 18, batch_id: 2200, cost: 0.061615, acc: 0.976562
-[Wed Oct 10 16:58:55 2018] epoch_id: 18, batch_id: 2300, cost: 0.081252, acc: 0.960938
-[Wed Oct 10 16:58:57 2018] epoch_id: 18, batch_id: 2400, cost: 0.009792, acc: 1.000000
-[Wed Oct 10 16:58:59 2018] epoch_id: 18, batch_id: 2500, cost: 0.039835, acc: 0.984375
-[Wed Oct 10 16:59:02 2018] epoch_id: 18, batch_id: 2600, cost: 0.002643, acc: 1.000000
-[Wed Oct 10 16:59:04 2018] epoch_id: 18, batch_id: 2700, cost: 0.017633, acc: 0.992188
-[Wed Oct 10 16:59:06 2018] epoch_id: 18, batch_id: 2800, cost: 0.050407, acc: 0.976562
-[Wed Oct 10 16:59:08 2018] epoch_id: 18, batch_id: 2900, cost: 0.066672, acc: 0.960938
-[Wed Oct 10 16:59:11 2018] epoch_id: 18, batch_id: 3000, cost: 0.023438, acc: 0.984375
-
-[Wed Oct 10 16:59:11 2018] epoch_id: 18, train_avg_cost: 0.028777, train_avg_acc: 0.989884
-[Wed Oct 10 16:59:12 2018] epoch_id: 18, dev_cost: 1.191979, accuracy: 0.8346
-[Wed Oct 10 16:59:13 2018] epoch_id: 18, test_cost: 1.159855, accuracy: 0.8344
-
-[Wed Oct 10 16:59:22 2018] epoch_id: 19, batch_id: 0, cost: 0.023233, acc: 0.992188
-[Wed Oct 10 16:59:24 2018] epoch_id: 19, batch_id: 100, cost: 0.006624, acc: 1.000000
-[Wed Oct 10 16:59:26 2018] epoch_id: 19, batch_id: 200, cost: 0.018784, acc: 0.992188
-[Wed Oct 10 16:59:28 2018] epoch_id: 19, batch_id: 300, cost: 0.012745, acc: 0.992188
-[Wed Oct 10 16:59:31 2018] epoch_id: 19, batch_id: 400, cost: 0.010857, acc: 1.000000
-[Wed Oct 10 16:59:33 2018] epoch_id: 19, batch_id: 500, cost: 0.006066, acc: 1.000000
-[Wed Oct 10 16:59:35 2018] epoch_id: 19, batch_id: 600, cost: 0.014349, acc: 0.992188
-[Wed Oct 10 16:59:38 2018] epoch_id: 19, batch_id: 700, cost: 0.016725, acc: 0.992188
-[Wed Oct 10 16:59:40 2018] epoch_id: 19, batch_id: 800, cost: 0.069121, acc: 0.984375
-[Wed Oct 10 16:59:42 2018] epoch_id: 19, batch_id: 900, cost: 0.018849, acc: 0.984375
-[Wed Oct 10 16:59:44 2018] epoch_id: 19, batch_id: 1000, cost: 0.031679, acc: 0.984375
-[Wed Oct 10 16:59:47 2018] epoch_id: 19, batch_id: 1100, cost: 0.010815, acc: 0.992188
-[Wed Oct 10 16:59:49 2018] epoch_id: 19, batch_id: 1200, cost: 0.015778, acc: 0.992188
-[Wed Oct 10 16:59:51 2018] epoch_id: 19, batch_id: 1300, cost: 0.055160, acc: 0.984375
-[Wed Oct 10 16:59:53 2018] epoch_id: 19, batch_id: 1400, cost: 0.009311, acc: 0.992188
-[Wed Oct 10 16:59:55 2018] epoch_id: 19, batch_id: 1500, cost: 0.014874, acc: 0.992188
-[Wed Oct 10 16:59:58 2018] epoch_id: 19, batch_id: 1600, cost: 0.038188, acc: 0.992188
-[Wed Oct 10 17:00:00 2018] epoch_id: 19, batch_id: 1700, cost: 0.001565, acc: 1.000000
-[Wed Oct 10 17:00:02 2018] epoch_id: 19, batch_id: 1800, cost: 0.013963, acc: 0.992188
-[Wed Oct 10 17:00:04 2018] epoch_id: 19, batch_id: 1900, cost: 0.028362, acc: 0.992188
-[Wed Oct 10 17:00:06 2018] epoch_id: 19, batch_id: 2000, cost: 0.006552, acc: 1.000000
-[Wed Oct 10 17:00:09 2018] epoch_id: 19, batch_id: 2100, cost: 0.045230, acc: 0.992188
-[Wed Oct 10 17:00:11 2018] epoch_id: 19, batch_id: 2200, cost: 0.029525, acc: 0.984375
-[Wed Oct 10 17:00:13 2018] epoch_id: 19, batch_id: 2300, cost: 0.009774, acc: 0.992188
-[Wed Oct 10 17:00:15 2018] epoch_id: 19, batch_id: 2400, cost: 0.003385, acc: 1.000000
-[Wed Oct 10 17:00:18 2018] epoch_id: 19, batch_id: 2500, cost: 0.030629, acc: 0.984375
-[Wed Oct 10 17:00:20 2018] epoch_id: 19, batch_id: 2600, cost: 0.039615, acc: 0.992188
-[Wed Oct 10 17:00:22 2018] epoch_id: 19, batch_id: 2700, cost: 0.016678, acc: 0.992188
-[Wed Oct 10 17:00:24 2018] epoch_id: 19, batch_id: 2800, cost: 0.004723, acc: 1.000000
-[Wed Oct 10 17:00:26 2018] epoch_id: 19, batch_id: 2900, cost: 0.018062, acc: 0.992188
-[Wed Oct 10 17:00:29 2018] epoch_id: 19, batch_id: 3000, cost: 0.032904, acc: 0.984375
-
-[Wed Oct 10 17:00:29 2018] epoch_id: 19, train_avg_cost: 0.026175, train_avg_acc: 0.991055
-[Wed Oct 10 17:00:30 2018] epoch_id: 19, dev_cost: 1.013367, accuracy: 0.8388
-[Wed Oct 10 17:00:31 2018] epoch_id: 19, test_cost: 1.016906, accuracy: 0.8335
-
-[Wed Oct 10 17:00:40 2018] epoch_id: 20, batch_id: 0, cost: 0.019038, acc: 0.992188
-[Wed Oct 10 17:00:42 2018] epoch_id: 20, batch_id: 100, cost: 0.001216, acc: 1.000000
-[Wed Oct 10 17:00:44 2018] epoch_id: 20, batch_id: 200, cost: 0.006635, acc: 1.000000
-[Wed Oct 10 17:00:47 2018] epoch_id: 20, batch_id: 300, cost: 0.051503, acc: 0.984375
-[Wed Oct 10 17:00:49 2018] epoch_id: 20, batch_id: 400, cost: 0.044815, acc: 0.992188
-[Wed Oct 10 17:00:51 2018] epoch_id: 20, batch_id: 500, cost: 0.041529, acc: 0.992188
-[Wed Oct 10 17:00:53 2018] epoch_id: 20, batch_id: 600, cost: 0.010035, acc: 1.000000
-[Wed Oct 10 17:00:56 2018] epoch_id: 20, batch_id: 700, cost: 0.019799, acc: 0.992188
-[Wed Oct 10 17:00:58 2018] epoch_id: 20, batch_id: 800, cost: 0.062296, acc: 0.984375
-[Wed Oct 10 17:01:00 2018] epoch_id: 20, batch_id: 900, cost: 0.015680, acc: 0.992188
-[Wed Oct 10 17:01:03 2018] epoch_id: 20, batch_id: 1000, cost: 0.051963, acc: 0.984375
-[Wed Oct 10 17:01:05 2018] epoch_id: 20, batch_id: 1100, cost: 0.023968, acc: 0.984375
-[Wed Oct 10 17:01:07 2018] epoch_id: 20, batch_id: 1200, cost: 0.079527, acc: 0.984375
-[Wed Oct 10 17:01:09 2018] epoch_id: 20, batch_id: 1300, cost: 0.039612, acc: 0.992188
-[Wed Oct 10 17:01:12 2018] epoch_id: 20, batch_id: 1400, cost: 0.010211, acc: 1.000000
-[Wed Oct 10 17:01:14 2018] epoch_id: 20, batch_id: 1500, cost: 0.012661, acc: 0.992188
-[Wed Oct 10 17:01:16 2018] epoch_id: 20, batch_id: 1600, cost: 0.051475, acc: 0.984375
-[Wed Oct 10 17:01:18 2018] epoch_id: 20, batch_id: 1700, cost: 0.013513, acc: 1.000000
-[Wed Oct 10 17:01:21 2018] epoch_id: 20, batch_id: 1800, cost: 0.006646, acc: 1.000000
-[Wed Oct 10 17:01:23 2018] epoch_id: 20, batch_id: 1900, cost: 0.013369, acc: 0.992188
-[Wed Oct 10 17:01:25 2018] epoch_id: 20, batch_id: 2000, cost: 0.030614, acc: 0.984375
-[Wed Oct 10 17:01:27 2018] epoch_id: 20, batch_id: 2100, cost: 0.003242, acc: 1.000000
-[Wed Oct 10 17:01:30 2018] epoch_id: 20, batch_id: 2200, cost: 0.051409, acc: 0.984375
-[Wed Oct 10 17:01:32 2018] epoch_id: 20, batch_id: 2300, cost: 0.005996, acc: 1.000000
-[Wed Oct 10 17:01:34 2018] epoch_id: 20, batch_id: 2400, cost: 0.049493, acc: 0.976562
-[Wed Oct 10 17:01:36 2018] epoch_id: 20, batch_id: 2500, cost: 0.013635, acc: 0.992188
-[Wed Oct 10 17:01:38 2018] epoch_id: 20, batch_id: 2600, cost: 0.019265, acc: 1.000000
-[Wed Oct 10 17:01:41 2018] epoch_id: 20, batch_id: 2700, cost: 0.040467, acc: 0.976562
-[Wed Oct 10 17:01:44 2018] epoch_id: 20, batch_id: 2800, cost: 0.029407, acc: 0.992188
-[Wed Oct 10 17:01:46 2018] epoch_id: 20, batch_id: 2900, cost: 0.036886, acc: 0.976562
-[Wed Oct 10 17:01:49 2018] epoch_id: 20, batch_id: 3000, cost: 0.018317, acc: 0.992188
-
-[Wed Oct 10 17:01:49 2018] epoch_id: 20, train_avg_cost: 0.025258, train_avg_acc: 0.991367
-[Wed Oct 10 17:01:50 2018] epoch_id: 20, dev_cost: 1.125290, accuracy: 0.8358
-[Wed Oct 10 17:01:51 2018] epoch_id: 20, test_cost: 1.148761, accuracy: 0.832
-
-[Wed Oct 10 17:01:59 2018] epoch_id: 21, batch_id: 0, cost: 0.020581, acc: 0.992188
-[Wed Oct 10 17:02:02 2018] epoch_id: 21, batch_id: 100, cost: 0.021132, acc: 0.992188
-[Wed Oct 10 17:02:04 2018] epoch_id: 21, batch_id: 200, cost: 0.040257, acc: 0.976562
-[Wed Oct 10 17:02:06 2018] epoch_id: 21, batch_id: 300, cost: 0.013450, acc: 1.000000
-[Wed Oct 10 17:02:08 2018] epoch_id: 21, batch_id: 400, cost: 0.027469, acc: 0.992188
-[Wed Oct 10 17:02:11 2018] epoch_id: 21, batch_id: 500, cost: 0.007088, acc: 0.992188
-[Wed Oct 10 17:02:13 2018] epoch_id: 21, batch_id: 600, cost: 0.028169, acc: 0.992188
-[Wed Oct 10 17:02:15 2018] epoch_id: 21, batch_id: 700, cost: 0.067799, acc: 0.984375
-[Wed Oct 10 17:02:17 2018] epoch_id: 21, batch_id: 800, cost: 0.003184, acc: 1.000000
-[Wed Oct 10 17:02:20 2018] epoch_id: 21, batch_id: 900, cost: 0.011056, acc: 0.992188
-[Wed Oct 10 17:02:22 2018] epoch_id: 21, batch_id: 1000, cost: 0.012187, acc: 1.000000
-[Wed Oct 10 17:02:24 2018] epoch_id: 21, batch_id: 1100, cost: 0.009409, acc: 0.992188
-[Wed Oct 10 17:02:26 2018] epoch_id: 21, batch_id: 1200, cost: 0.000739, acc: 1.000000
-[Wed Oct 10 17:02:29 2018] epoch_id: 21, batch_id: 1300, cost: 0.002971, acc: 1.000000
-[Wed Oct 10 17:02:31 2018] epoch_id: 21, batch_id: 1400, cost: 0.031287, acc: 0.984375
-[Wed Oct 10 17:02:33 2018] epoch_id: 21, batch_id: 1500, cost: 0.023455, acc: 0.992188
-[Wed Oct 10 17:02:36 2018] epoch_id: 21, batch_id: 1600, cost: 0.007438, acc: 1.000000
-[Wed Oct 10 17:02:38 2018] epoch_id: 21, batch_id: 1700, cost: 0.035499, acc: 0.968750
-[Wed Oct 10 17:02:40 2018] epoch_id: 21, batch_id: 1800, cost: 0.012515, acc: 1.000000
-[Wed Oct 10 17:02:42 2018] epoch_id: 21, batch_id: 1900, cost: 0.008550, acc: 1.000000
-[Wed Oct 10 17:02:45 2018] epoch_id: 21, batch_id: 2000, cost: 0.051551, acc: 0.992188
-[Wed Oct 10 17:02:47 2018] epoch_id: 21, batch_id: 2100, cost: 0.004980, acc: 1.000000
-[Wed Oct 10 17:02:49 2018] epoch_id: 21, batch_id: 2200, cost: 0.006854, acc: 1.000000
-[Wed Oct 10 17:02:51 2018] epoch_id: 21, batch_id: 2300, cost: 0.071025, acc: 0.968750
-[Wed Oct 10 17:02:55 2018] epoch_id: 21, batch_id: 2400, cost: 0.013599, acc: 1.000000
-[Wed Oct 10 17:02:57 2018] epoch_id: 21, batch_id: 2500, cost: 0.025085, acc: 0.992188
-[Wed Oct 10 17:02:59 2018] epoch_id: 21, batch_id: 2600, cost: 0.018276, acc: 0.984375
-[Wed Oct 10 17:03:01 2018] epoch_id: 21, batch_id: 2700, cost: 0.040565, acc: 0.984375
-[Wed Oct 10 17:03:04 2018] epoch_id: 21, batch_id: 2800, cost: 0.099454, acc: 0.968750
-[Wed Oct 10 17:03:06 2018] epoch_id: 21, batch_id: 2900, cost: 0.017812, acc: 0.992188
-[Wed Oct 10 17:03:08 2018] epoch_id: 21, batch_id: 3000, cost: 0.019825, acc: 0.992188
-
-[Wed Oct 10 17:03:09 2018] epoch_id: 21, train_avg_cost: 0.024180, train_avg_acc: 0.991505
-[Wed Oct 10 17:03:10 2018] epoch_id: 21, dev_cost: 1.413867, accuracy: 0.836
-[Wed Oct 10 17:03:11 2018] epoch_id: 21, test_cost: 1.380237, accuracy: 0.8353
-
-[Wed Oct 10 17:03:19 2018] epoch_id: 22, batch_id: 0, cost: 0.001493, acc: 1.000000
-[Wed Oct 10 17:03:21 2018] epoch_id: 22, batch_id: 100, cost: 0.017211, acc: 0.984375
-[Wed Oct 10 17:03:23 2018] epoch_id: 22, batch_id: 200, cost: 0.015626, acc: 0.992188
-[Wed Oct 10 17:03:25 2018] epoch_id: 22, batch_id: 300, cost: 0.002411, acc: 1.000000
-[Wed Oct 10 17:03:28 2018] epoch_id: 22, batch_id: 400, cost: 0.098118, acc: 0.984375
-[Wed Oct 10 17:03:30 2018] epoch_id: 22, batch_id: 500, cost: 0.031192, acc: 0.992188
-[Wed Oct 10 17:03:32 2018] epoch_id: 22, batch_id: 600, cost: 0.002122, acc: 1.000000
-[Wed Oct 10 17:03:34 2018] epoch_id: 22, batch_id: 700, cost: 0.006148, acc: 1.000000
-[Wed Oct 10 17:03:38 2018] epoch_id: 22, batch_id: 800, cost: 0.007830, acc: 1.000000
-[Wed Oct 10 17:03:40 2018] epoch_id: 22, batch_id: 900, cost: 0.009371, acc: 1.000000
-[Wed Oct 10 17:03:43 2018] epoch_id: 22, batch_id: 1000, cost: 0.024280, acc: 0.984375
-[Wed Oct 10 17:03:45 2018] epoch_id: 22, batch_id: 1100, cost: 0.067847, acc: 0.984375
-[Wed Oct 10 17:03:47 2018] epoch_id: 22, batch_id: 1200, cost: 0.024875, acc: 0.984375
-[Wed Oct 10 17:03:50 2018] epoch_id: 22, batch_id: 1300, cost: 0.004252, acc: 1.000000
-[Wed Oct 10 17:03:52 2018] epoch_id: 22, batch_id: 1400, cost: 0.014934, acc: 0.992188
-[Wed Oct 10 17:03:54 2018] epoch_id: 22, batch_id: 1500, cost: 0.008299, acc: 1.000000
-[Wed Oct 10 17:03:56 2018] epoch_id: 22, batch_id: 1600, cost: 0.007932, acc: 1.000000
-[Wed Oct 10 17:03:59 2018] epoch_id: 22, batch_id: 1700, cost: 0.007008, acc: 1.000000
-[Wed Oct 10 17:04:01 2018] epoch_id: 22, batch_id: 1800, cost: 0.028636, acc: 0.984375
-[Wed Oct 10 17:04:03 2018] epoch_id: 22, batch_id: 1900, cost: 0.012712, acc: 0.992188
-[Wed Oct 10 17:04:05 2018] epoch_id: 22, batch_id: 2000, cost: 0.027561, acc: 0.992188
-[Wed Oct 10 17:04:08 2018] epoch_id: 22, batch_id: 2100, cost: 0.017589, acc: 0.992188
-[Wed Oct 10 17:04:10 2018] epoch_id: 22, batch_id: 2200, cost: 0.016391, acc: 0.992188
-[Wed Oct 10 17:04:12 2018] epoch_id: 22, batch_id: 2300, cost: 0.042172, acc: 0.984375
-[Wed Oct 10 17:04:14 2018] epoch_id: 22, batch_id: 2400, cost: 0.024060, acc: 0.984375
-[Wed Oct 10 17:04:17 2018] epoch_id: 22, batch_id: 2500, cost: 0.014206, acc: 1.000000
-[Wed Oct 10 17:04:19 2018] epoch_id: 22, batch_id: 2600, cost: 0.028562, acc: 0.992188
-[Wed Oct 10 17:04:21 2018] epoch_id: 22, batch_id: 2700, cost: 0.013936, acc: 0.992188
-[Wed Oct 10 17:04:23 2018] epoch_id: 22, batch_id: 2800, cost: 0.023205, acc: 0.984375
-[Wed Oct 10 17:04:26 2018] epoch_id: 22, batch_id: 2900, cost: 0.031024, acc: 0.984375
-[Wed Oct 10 17:04:28 2018] epoch_id: 22, batch_id: 3000, cost: 0.004115, acc: 1.000000
-
-[Wed Oct 10 17:04:29 2018] epoch_id: 22, train_avg_cost: 0.022458, train_avg_acc: 0.992184
-[Wed Oct 10 17:04:30 2018] epoch_id: 22, dev_cost: 1.388674, accuracy: 0.8329
-[Wed Oct 10 17:04:31 2018] epoch_id: 22, test_cost: 1.366122, accuracy: 0.8359
-
-[Wed Oct 10 17:04:39 2018] epoch_id: 23, batch_id: 0, cost: 0.012273, acc: 0.992188
-[Wed Oct 10 17:04:41 2018] epoch_id: 23, batch_id: 100, cost: 0.010904, acc: 0.992188
-[Wed Oct 10 17:04:44 2018] epoch_id: 23, batch_id: 200, cost: 0.001967, acc: 1.000000
-[Wed Oct 10 17:04:46 2018] epoch_id: 23, batch_id: 300, cost: 0.006554, acc: 1.000000
-[Wed Oct 10 17:04:48 2018] epoch_id: 23, batch_id: 400, cost: 0.005179, acc: 1.000000
-[Wed Oct 10 17:04:50 2018] epoch_id: 23, batch_id: 500, cost: 0.014761, acc: 0.992188
-[Wed Oct 10 17:04:53 2018] epoch_id: 23, batch_id: 600, cost: 0.015971, acc: 0.992188
-[Wed Oct 10 17:04:55 2018] epoch_id: 23, batch_id: 700, cost: 0.058416, acc: 0.984375
-[Wed Oct 10 17:04:57 2018] epoch_id: 23, batch_id: 800, cost: 0.005064, acc: 1.000000
-[Wed Oct 10 17:04:59 2018] epoch_id: 23, batch_id: 900, cost: 0.003761, acc: 1.000000
-[Wed Oct 10 17:05:02 2018] epoch_id: 23, batch_id: 1000, cost: 0.002844, acc: 1.000000
-[Wed Oct 10 17:05:04 2018] epoch_id: 23, batch_id: 1100, cost: 0.010259, acc: 1.000000
-[Wed Oct 10 17:05:06 2018] epoch_id: 23, batch_id: 1200, cost: 0.005445, acc: 1.000000
-[Wed Oct 10 17:05:09 2018] epoch_id: 23, batch_id: 1300, cost: 0.018197, acc: 0.992188
-[Wed Oct 10 17:05:11 2018] epoch_id: 23, batch_id: 1400, cost: 0.016600, acc: 0.992188
-[Wed Oct 10 17:05:13 2018] epoch_id: 23, batch_id: 1500, cost: 0.047691, acc: 0.992188
-[Wed Oct 10 17:05:15 2018] epoch_id: 23, batch_id: 1600, cost: 0.084442, acc: 0.984375
-[Wed Oct 10 17:05:18 2018] epoch_id: 23, batch_id: 1700, cost: 0.044283, acc: 0.992188
-[Wed Oct 10 17:05:21 2018] epoch_id: 23, batch_id: 1800, cost: 0.120200, acc: 0.984375
-[Wed Oct 10 17:05:23 2018] epoch_id: 23, batch_id: 1900, cost: 0.013874, acc: 0.992188
-[Wed Oct 10 17:05:26 2018] epoch_id: 23, batch_id: 2000, cost: 0.027709, acc: 0.984375
-[Wed Oct 10 17:05:28 2018] epoch_id: 23, batch_id: 2100, cost: 0.017088, acc: 0.992188
-[Wed Oct 10 17:05:30 2018] epoch_id: 23, batch_id: 2200, cost: 0.049081, acc: 0.976562
-[Wed Oct 10 17:05:32 2018] epoch_id: 23, batch_id: 2300, cost: 0.013016, acc: 0.992188
-[Wed Oct 10 17:05:35 2018] epoch_id: 23, batch_id: 2400, cost: 0.015467, acc: 0.992188
-[Wed Oct 10 17:05:37 2018] epoch_id: 23, batch_id: 2500, cost: 0.002745, acc: 1.000000
-[Wed Oct 10 17:05:39 2018] epoch_id: 23, batch_id: 2600, cost: 0.002618, acc: 1.000000
-[Wed Oct 10 17:05:42 2018] epoch_id: 23, batch_id: 2700, cost: 0.010789, acc: 1.000000
-[Wed Oct 10 17:05:44 2018] epoch_id: 23, batch_id: 2800, cost: 0.026513, acc: 0.984375
-[Wed Oct 10 17:05:46 2018] epoch_id: 23, batch_id: 2900, cost: 0.056513, acc: 0.984375
-[Wed Oct 10 17:05:49 2018] epoch_id: 23, batch_id: 3000, cost: 0.007607, acc: 1.000000
-
-[Wed Oct 10 17:05:49 2018] epoch_id: 23, train_avg_cost: 0.021786, train_avg_acc: 0.992707
-[Wed Oct 10 17:05:50 2018] epoch_id: 23, dev_cost: 1.181561, accuracy: 0.8368
-[Wed Oct 10 17:05:51 2018] epoch_id: 23, test_cost: 1.209735, accuracy: 0.8339
-
-[Wed Oct 10 17:06:00 2018] epoch_id: 24, batch_id: 0, cost: 0.005431, acc: 1.000000
-[Wed Oct 10 17:06:02 2018] epoch_id: 24, batch_id: 100, cost: 0.017588, acc: 0.984375
-[Wed Oct 10 17:06:04 2018] epoch_id: 24, batch_id: 200, cost: 0.078571, acc: 0.976562
-[Wed Oct 10 17:06:06 2018] epoch_id: 24, batch_id: 300, cost: 0.003192, acc: 1.000000
-[Wed Oct 10 17:06:09 2018] epoch_id: 24, batch_id: 400, cost: 0.008610, acc: 1.000000
-[Wed Oct 10 17:06:11 2018] epoch_id: 24, batch_id: 500, cost: 0.010603, acc: 0.992188
-[Wed Oct 10 17:06:13 2018] epoch_id: 24, batch_id: 600, cost: 0.068159, acc: 0.984375
-[Wed Oct 10 17:06:15 2018] epoch_id: 24, batch_id: 700, cost: 0.031611, acc: 0.992188
-[Wed Oct 10 17:06:18 2018] epoch_id: 24, batch_id: 800, cost: 0.005276, acc: 1.000000
-[Wed Oct 10 17:06:20 2018] epoch_id: 24, batch_id: 900, cost: 0.019978, acc: 0.992188
-[Wed Oct 10 17:06:22 2018] epoch_id: 24, batch_id: 1000, cost: 0.061957, acc: 0.992188
-[Wed Oct 10 17:06:25 2018] epoch_id: 24, batch_id: 1100, cost: 0.015165, acc: 0.992188
-[Wed Oct 10 17:06:27 2018] epoch_id: 24, batch_id: 1200, cost: 0.052448, acc: 0.976562
-[Wed Oct 10 17:06:29 2018] epoch_id: 24, batch_id: 1300, cost: 0.003287, acc: 1.000000
-[Wed Oct 10 17:06:31 2018] epoch_id: 24, batch_id: 1400, cost: 0.027564, acc: 0.992188
-[Wed Oct 10 17:06:34 2018] epoch_id: 24, batch_id: 1500, cost: 0.002861, acc: 1.000000
-[Wed Oct 10 17:06:36 2018] epoch_id: 24, batch_id: 1600, cost: 0.022500, acc: 0.992188
-[Wed Oct 10 17:06:38 2018] epoch_id: 24, batch_id: 1700, cost: 0.041690, acc: 0.984375
-[Wed Oct 10 17:06:40 2018] epoch_id: 24, batch_id: 1800, cost: 0.016889, acc: 0.992188
-[Wed Oct 10 17:06:43 2018] epoch_id: 24, batch_id: 1900, cost: 0.026357, acc: 0.992188
-[Wed Oct 10 17:06:45 2018] epoch_id: 24, batch_id: 2000, cost: 0.035357, acc: 0.984375
-[Wed Oct 10 17:06:47 2018] epoch_id: 24, batch_id: 2100, cost: 0.070517, acc: 0.960938
-[Wed Oct 10 17:06:49 2018] epoch_id: 24, batch_id: 2200, cost: 0.021093, acc: 0.984375
-[Wed Oct 10 17:06:52 2018] epoch_id: 24, batch_id: 2300, cost: 0.003296, acc: 1.000000
-[Wed Oct 10 17:06:54 2018] epoch_id: 24, batch_id: 2400, cost: 0.002669, acc: 1.000000
-[Wed Oct 10 17:06:56 2018] epoch_id: 24, batch_id: 2500, cost: 0.047008, acc: 0.976562
-[Wed Oct 10 17:06:58 2018] epoch_id: 24, batch_id: 2600, cost: 0.015561, acc: 0.992188
-[Wed Oct 10 17:07:00 2018] epoch_id: 24, batch_id: 2700, cost: 0.074711, acc: 0.984375
-[Wed Oct 10 17:07:03 2018] epoch_id: 24, batch_id: 2800, cost: 0.021376, acc: 0.992188
-[Wed Oct 10 17:07:05 2018] epoch_id: 24, batch_id: 2900, cost: 0.013928, acc: 1.000000
-[Wed Oct 10 17:07:07 2018] epoch_id: 24, batch_id: 3000, cost: 0.019474, acc: 0.992188
-
-[Wed Oct 10 17:07:07 2018] epoch_id: 24, train_avg_cost: 0.020611, train_avg_acc: 0.992913
-[Wed Oct 10 17:07:08 2018] epoch_id: 24, dev_cost: 1.249092, accuracy: 0.8329
-[Wed Oct 10 17:07:09 2018] epoch_id: 24, test_cost: 1.206091, accuracy: 0.8348
-
-[Wed Oct 10 17:07:18 2018] epoch_id: 25, batch_id: 0, cost: 0.009832, acc: 1.000000
-[Wed Oct 10 17:07:21 2018] epoch_id: 25, batch_id: 100, cost: 0.007028, acc: 1.000000
-[Wed Oct 10 17:07:23 2018] epoch_id: 25, batch_id: 200, cost: 0.029548, acc: 0.984375
-[Wed Oct 10 17:07:25 2018] epoch_id: 25, batch_id: 300, cost: 0.001753, acc: 1.000000
-[Wed Oct 10 17:07:28 2018] epoch_id: 25, batch_id: 400, cost: 0.001457, acc: 1.000000
-[Wed Oct 10 17:07:30 2018] epoch_id: 25, batch_id: 500, cost: 0.004209, acc: 1.000000
-[Wed Oct 10 17:07:32 2018] epoch_id: 25, batch_id: 600, cost: 0.002758, acc: 1.000000
-[Wed Oct 10 17:07:35 2018] epoch_id: 25, batch_id: 700, cost: 0.039204, acc: 0.984375
-[Wed Oct 10 17:07:37 2018] epoch_id: 25, batch_id: 800, cost: 0.004454, acc: 1.000000
-[Wed Oct 10 17:07:39 2018] epoch_id: 25, batch_id: 900, cost: 0.005273, acc: 1.000000
-[Wed Oct 10 17:07:41 2018] epoch_id: 25, batch_id: 1000, cost: 0.008021, acc: 0.992188
-[Wed Oct 10 17:07:44 2018] epoch_id: 25, batch_id: 1100, cost: 0.037441, acc: 0.976562
-[Wed Oct 10 17:07:46 2018] epoch_id: 25, batch_id: 1200, cost: 0.011153, acc: 1.000000
-[Wed Oct 10 17:07:48 2018] epoch_id: 25, batch_id: 1300, cost: 0.064342, acc: 0.992188
-[Wed Oct 10 17:07:50 2018] epoch_id: 25, batch_id: 1400, cost: 0.036600, acc: 0.992188
-[Wed Oct 10 17:07:53 2018] epoch_id: 25, batch_id: 1500, cost: 0.046661, acc: 0.992188
-[Wed Oct 10 17:07:55 2018] epoch_id: 25, batch_id: 1600, cost: 0.015580, acc: 1.000000
-[Wed Oct 10 17:07:57 2018] epoch_id: 25, batch_id: 1700, cost: 0.008311, acc: 1.000000
-[Wed Oct 10 17:07:59 2018] epoch_id: 25, batch_id: 1800, cost: 0.004560, acc: 1.000000
-[Wed Oct 10 17:08:02 2018] epoch_id: 25, batch_id: 1900, cost: 0.012200, acc: 1.000000
-[Wed Oct 10 17:08:04 2018] epoch_id: 25, batch_id: 2000, cost: 0.006555, acc: 1.000000
-[Wed Oct 10 17:08:06 2018] epoch_id: 25, batch_id: 2100, cost: 0.028259, acc: 0.992188
-[Wed Oct 10 17:08:08 2018] epoch_id: 25, batch_id: 2200, cost: 0.003801, acc: 1.000000
-[Wed Oct 10 17:08:11 2018] epoch_id: 25, batch_id: 2300, cost: 0.004532, acc: 1.000000
-[Wed Oct 10 17:08:13 2018] epoch_id: 25, batch_id: 2400, cost: 0.008551, acc: 1.000000
-[Wed Oct 10 17:08:15 2018] epoch_id: 25, batch_id: 2500, cost: 0.013781, acc: 0.992188
-[Wed Oct 10 17:08:17 2018] epoch_id: 25, batch_id: 2600, cost: 0.024098, acc: 0.992188
-[Wed Oct 10 17:08:21 2018] epoch_id: 25, batch_id: 2700, cost: 0.009117, acc: 0.992188
-[Wed Oct 10 17:08:23 2018] epoch_id: 25, batch_id: 2800, cost: 0.032231, acc: 0.984375
-[Wed Oct 10 17:08:25 2018] epoch_id: 25, batch_id: 2900, cost: 0.004502, acc: 1.000000
-[Wed Oct 10 17:08:28 2018] epoch_id: 25, batch_id: 3000, cost: 0.006727, acc: 1.000000
-
-[Wed Oct 10 17:08:28 2018] epoch_id: 25, train_avg_cost: 0.020529, train_avg_acc: 0.993019
-[Wed Oct 10 17:08:29 2018] epoch_id: 25, dev_cost: 1.238637, accuracy: 0.8323
-[Wed Oct 10 17:08:30 2018] epoch_id: 25, test_cost: 1.213099, accuracy: 0.8345
-
-[Wed Oct 10 17:08:38 2018] epoch_id: 26, batch_id: 0, cost: 0.040923, acc: 0.992188
-[Wed Oct 10 17:08:40 2018] epoch_id: 26, batch_id: 100, cost: 0.003892, acc: 1.000000
-[Wed Oct 10 17:08:43 2018] epoch_id: 26, batch_id: 200, cost: 0.005719, acc: 1.000000
-[Wed Oct 10 17:08:45 2018] epoch_id: 26, batch_id: 300, cost: 0.011791, acc: 1.000000
-[Wed Oct 10 17:08:47 2018] epoch_id: 26, batch_id: 400, cost: 0.015297, acc: 0.992188
-[Wed Oct 10 17:08:49 2018] epoch_id: 26, batch_id: 500, cost: 0.067796, acc: 0.984375
-[Wed Oct 10 17:08:52 2018] epoch_id: 26, batch_id: 600, cost: 0.041215, acc: 0.992188
-[Wed Oct 10 17:08:54 2018] epoch_id: 26, batch_id: 700, cost: 0.017786, acc: 0.984375
-[Wed Oct 10 17:08:56 2018] epoch_id: 26, batch_id: 800, cost: 0.033173, acc: 0.992188
-[Wed Oct 10 17:08:59 2018] epoch_id: 26, batch_id: 900, cost: 0.007282, acc: 0.992188
-[Wed Oct 10 17:09:01 2018] epoch_id: 26, batch_id: 1000, cost: 0.028577, acc: 0.992188
-[Wed Oct 10 17:09:03 2018] epoch_id: 26, batch_id: 1100, cost: 0.017994, acc: 0.992188
-[Wed Oct 10 17:09:05 2018] epoch_id: 26, batch_id: 1200, cost: 0.005319, acc: 1.000000
-[Wed Oct 10 17:09:08 2018] epoch_id: 26, batch_id: 1300, cost: 0.030209, acc: 0.992188
-[Wed Oct 10 17:09:10 2018] epoch_id: 26, batch_id: 1400, cost: 0.012992, acc: 0.992188
-[Wed Oct 10 17:09:12 2018] epoch_id: 26, batch_id: 1500, cost: 0.014228, acc: 0.992188
-[Wed Oct 10 17:09:15 2018] epoch_id: 26, batch_id: 1600, cost: 0.008148, acc: 1.000000
-[Wed Oct 10 17:09:17 2018] epoch_id: 26, batch_id: 1700, cost: 0.003299, acc: 1.000000
-[Wed Oct 10 17:09:19 2018] epoch_id: 26, batch_id: 1800, cost: 0.026134, acc: 0.992188
-[Wed Oct 10 17:09:22 2018] epoch_id: 26, batch_id: 1900, cost: 0.016610, acc: 1.000000
-[Wed Oct 10 17:09:24 2018] epoch_id: 26, batch_id: 2000, cost: 0.019105, acc: 0.992188
-[Wed Oct 10 17:09:26 2018] epoch_id: 26, batch_id: 2100, cost: 0.004593, acc: 1.000000
-[Wed Oct 10 17:09:28 2018] epoch_id: 26, batch_id: 2200, cost: 0.036595, acc: 0.992188
-[Wed Oct 10 17:09:32 2018] epoch_id: 26, batch_id: 2300, cost: 0.003857, acc: 1.000000
-[Wed Oct 10 17:09:34 2018] epoch_id: 26, batch_id: 2400, cost: 0.002700, acc: 1.000000
-[Wed Oct 10 17:09:36 2018] epoch_id: 26, batch_id: 2500, cost: 0.002269, acc: 1.000000
-[Wed Oct 10 17:09:38 2018] epoch_id: 26, batch_id: 2600, cost: 0.022186, acc: 0.992188
-[Wed Oct 10 17:09:41 2018] epoch_id: 26, batch_id: 2700, cost: 0.035991, acc: 0.976562
-[Wed Oct 10 17:09:43 2018] epoch_id: 26, batch_id: 2800, cost: 0.005430, acc: 1.000000
-[Wed Oct 10 17:09:45 2018] epoch_id: 26, batch_id: 2900, cost: 0.017578, acc: 0.992188
-[Wed Oct 10 17:09:47 2018] epoch_id: 26, batch_id: 3000, cost: 0.030596, acc: 0.984375
-
-[Wed Oct 10 17:09:48 2018] epoch_id: 26, train_avg_cost: 0.019528, train_avg_acc: 0.993425
-[Wed Oct 10 17:09:49 2018] epoch_id: 26, dev_cost: 1.452644, accuracy: 0.8334
-[Wed Oct 10 17:09:50 2018] epoch_id: 26, test_cost: 1.449995, accuracy: 0.8329
-
-[Wed Oct 10 17:09:58 2018] epoch_id: 27, batch_id: 0, cost: 0.006640, acc: 1.000000
-[Wed Oct 10 17:10:00 2018] epoch_id: 27, batch_id: 100, cost: 0.001101, acc: 1.000000
-[Wed Oct 10 17:10:02 2018] epoch_id: 27, batch_id: 200, cost: 0.019329, acc: 0.992188
-[Wed Oct 10 17:10:05 2018] epoch_id: 27, batch_id: 300, cost: 0.002996, acc: 1.000000
-[Wed Oct 10 17:10:07 2018] epoch_id: 27, batch_id: 400, cost: 0.002077, acc: 1.000000
-[Wed Oct 10 17:10:09 2018] epoch_id: 27, batch_id: 500, cost: 0.007058, acc: 1.000000
-[Wed Oct 10 17:10:11 2018] epoch_id: 27, batch_id: 600, cost: 0.002119, acc: 1.000000
-[Wed Oct 10 17:10:14 2018] epoch_id: 27, batch_id: 700, cost: 0.039876, acc: 0.984375
-[Wed Oct 10 17:10:16 2018] epoch_id: 27, batch_id: 800, cost: 0.010680, acc: 1.000000
-[Wed Oct 10 17:10:19 2018] epoch_id: 27, batch_id: 900, cost: 0.004508, acc: 1.000000
-[Wed Oct 10 17:10:21 2018] epoch_id: 27, batch_id: 1000, cost: 0.029683, acc: 0.984375
-[Wed Oct 10 17:10:24 2018] epoch_id: 27, batch_id: 1100, cost: 0.011985, acc: 1.000000
-[Wed Oct 10 17:10:26 2018] epoch_id: 27, batch_id: 1200, cost: 0.004091, acc: 1.000000
-[Wed Oct 10 17:10:28 2018] epoch_id: 27, batch_id: 1300, cost: 0.028585, acc: 0.984375
-[Wed Oct 10 17:10:30 2018] epoch_id: 27, batch_id: 1400, cost: 0.001462, acc: 1.000000
-[Wed Oct 10 17:10:33 2018] epoch_id: 27, batch_id: 1500, cost: 0.033079, acc: 0.992188
-[Wed Oct 10 17:10:35 2018] epoch_id: 27, batch_id: 1600, cost: 0.017679, acc: 0.992188
-[Wed Oct 10 17:10:37 2018] epoch_id: 27, batch_id: 1700, cost: 0.000921, acc: 1.000000
-[Wed Oct 10 17:10:39 2018] epoch_id: 27, batch_id: 1800, cost: 0.029850, acc: 0.984375
-[Wed Oct 10 17:10:42 2018] epoch_id: 27, batch_id: 1900, cost: 0.005679, acc: 1.000000
-[Wed Oct 10 17:10:44 2018] epoch_id: 27, batch_id: 2000, cost: 0.007635, acc: 0.992188
-[Wed Oct 10 17:10:46 2018] epoch_id: 27, batch_id: 2100, cost: 0.056935, acc: 0.984375
-[Wed Oct 10 17:10:48 2018] epoch_id: 27, batch_id: 2200, cost: 0.014361, acc: 1.000000
-[Wed Oct 10 17:10:51 2018] epoch_id: 27, batch_id: 2300, cost: 0.040282, acc: 0.984375
-[Wed Oct 10 17:10:53 2018] epoch_id: 27, batch_id: 2400, cost: 0.004073, acc: 1.000000
-[Wed Oct 10 17:10:55 2018] epoch_id: 27, batch_id: 2500, cost: 0.013922, acc: 0.984375
-[Wed Oct 10 17:10:57 2018] epoch_id: 27, batch_id: 2600, cost: 0.018309, acc: 0.992188
-[Wed Oct 10 17:10:59 2018] epoch_id: 27, batch_id: 2700, cost: 0.011584, acc: 0.992188
-[Wed Oct 10 17:11:02 2018] epoch_id: 27, batch_id: 2800, cost: 0.018637, acc: 0.992188
-[Wed Oct 10 17:11:04 2018] epoch_id: 27, batch_id: 2900, cost: 0.013617, acc: 0.992188
-[Wed Oct 10 17:11:06 2018] epoch_id: 27, batch_id: 3000, cost: 0.079333, acc: 0.976562
-
-[Wed Oct 10 17:11:07 2018] epoch_id: 27, train_avg_cost: 0.018039, train_avg_acc: 0.993701
-[Wed Oct 10 17:11:08 2018] epoch_id: 27, dev_cost: 1.463991, accuracy: 0.8333
-[Wed Oct 10 17:11:09 2018] epoch_id: 27, test_cost: 1.450415, accuracy: 0.8334
-
-[Wed Oct 10 17:11:17 2018] epoch_id: 28, batch_id: 0, cost: 0.023539, acc: 0.984375
-[Wed Oct 10 17:11:20 2018] epoch_id: 28, batch_id: 100, cost: 0.005577, acc: 1.000000
-[Wed Oct 10 17:11:22 2018] epoch_id: 28, batch_id: 200, cost: 0.001478, acc: 1.000000
-[Wed Oct 10 17:11:24 2018] epoch_id: 28, batch_id: 300, cost: 0.005870, acc: 1.000000
-[Wed Oct 10 17:11:26 2018] epoch_id: 28, batch_id: 400, cost: 0.021292, acc: 0.992188
-[Wed Oct 10 17:11:29 2018] epoch_id: 28, batch_id: 500, cost: 0.032081, acc: 0.984375
-[Wed Oct 10 17:11:31 2018] epoch_id: 28, batch_id: 600, cost: 0.004568, acc: 1.000000
-[Wed Oct 10 17:11:33 2018] epoch_id: 28, batch_id: 700, cost: 0.006552, acc: 1.000000
-[Wed Oct 10 17:11:35 2018] epoch_id: 28, batch_id: 800, cost: 0.012579, acc: 0.992188
-[Wed Oct 10 17:11:38 2018] epoch_id: 28, batch_id: 900, cost: 0.004214, acc: 1.000000
-[Wed Oct 10 17:11:40 2018] epoch_id: 28, batch_id: 1000, cost: 0.023843, acc: 0.984375
-[Wed Oct 10 17:11:42 2018] epoch_id: 28, batch_id: 1100, cost: 0.017869, acc: 0.992188
-[Wed Oct 10 17:11:44 2018] epoch_id: 28, batch_id: 1200, cost: 0.045617, acc: 0.984375
-[Wed Oct 10 17:11:46 2018] epoch_id: 28, batch_id: 1300, cost: 0.012739, acc: 0.992188
-[Wed Oct 10 17:11:49 2018] epoch_id: 28, batch_id: 1400, cost: 0.020053, acc: 0.992188
-[Wed Oct 10 17:11:51 2018] epoch_id: 28, batch_id: 1500, cost: 0.006956, acc: 1.000000
-[Wed Oct 10 17:11:53 2018] epoch_id: 28, batch_id: 1600, cost: 0.022830, acc: 0.984375
-[Wed Oct 10 17:11:55 2018] epoch_id: 28, batch_id: 1700, cost: 0.008924, acc: 1.000000
-[Wed Oct 10 17:11:58 2018] epoch_id: 28, batch_id: 1800, cost: 0.013902, acc: 0.992188
-[Wed Oct 10 17:12:01 2018] epoch_id: 28, batch_id: 1900, cost: 0.026418, acc: 0.984375
-[Wed Oct 10 17:12:03 2018] epoch_id: 28, batch_id: 2000, cost: 0.006809, acc: 1.000000
-[Wed Oct 10 17:12:05 2018] epoch_id: 28, batch_id: 2100, cost: 0.041039, acc: 0.984375
-[Wed Oct 10 17:12:08 2018] epoch_id: 28, batch_id: 2200, cost: 0.023235, acc: 0.992188
-[Wed Oct 10 17:12:10 2018] epoch_id: 28, batch_id: 2300, cost: 0.057685, acc: 0.976562
-[Wed Oct 10 17:12:12 2018] epoch_id: 28, batch_id: 2400, cost: 0.012688, acc: 1.000000
-[Wed Oct 10 17:12:14 2018] epoch_id: 28, batch_id: 2500, cost: 0.010697, acc: 0.992188
-[Wed Oct 10 17:12:16 2018] epoch_id: 28, batch_id: 2600, cost: 0.025213, acc: 0.992188
-[Wed Oct 10 17:12:19 2018] epoch_id: 28, batch_id: 2700, cost: 0.011269, acc: 0.992188
-[Wed Oct 10 17:12:21 2018] epoch_id: 28, batch_id: 2800, cost: 0.001141, acc: 1.000000
-[Wed Oct 10 17:12:23 2018] epoch_id: 28, batch_id: 2900, cost: 0.049410, acc: 0.984375
-[Wed Oct 10 17:12:25 2018] epoch_id: 28, batch_id: 3000, cost: 0.019739, acc: 0.992188
-
-[Wed Oct 10 17:12:26 2018] epoch_id: 28, train_avg_cost: 0.018105, train_avg_acc: 0.993756
-[Wed Oct 10 17:12:27 2018] epoch_id: 28, dev_cost: 1.200318, accuracy: 0.8345
-[Wed Oct 10 17:12:28 2018] epoch_id: 28, test_cost: 1.228304, accuracy: 0.8308
-
-[Wed Oct 10 17:12:36 2018] epoch_id: 29, batch_id: 0, cost: 0.004694, acc: 1.000000
-[Wed Oct 10 17:12:39 2018] epoch_id: 29, batch_id: 100, cost: 0.008528, acc: 0.992188
-[Wed Oct 10 17:12:41 2018] epoch_id: 29, batch_id: 200, cost: 0.006778, acc: 0.992188
-[Wed Oct 10 17:12:43 2018] epoch_id: 29, batch_id: 300, cost: 0.026610, acc: 0.992188
-[Wed Oct 10 17:12:45 2018] epoch_id: 29, batch_id: 400, cost: 0.008479, acc: 1.000000
-[Wed Oct 10 17:12:47 2018] epoch_id: 29, batch_id: 500, cost: 0.021705, acc: 0.984375
-[Wed Oct 10 17:12:50 2018] epoch_id: 29, batch_id: 600, cost: 0.010583, acc: 0.992188
-[Wed Oct 10 17:12:52 2018] epoch_id: 29, batch_id: 700, cost: 0.056105, acc: 0.992188
-[Wed Oct 10 17:12:54 2018] epoch_id: 29, batch_id: 800, cost: 0.000675, acc: 1.000000
-[Wed Oct 10 17:12:56 2018] epoch_id: 29, batch_id: 900, cost: 0.011277, acc: 1.000000
-[Wed Oct 10 17:12:58 2018] epoch_id: 29, batch_id: 1000, cost: 0.006004, acc: 1.000000
-[Wed Oct 10 17:13:01 2018] epoch_id: 29, batch_id: 1100, cost: 0.000914, acc: 1.000000
-[Wed Oct 10 17:13:03 2018] epoch_id: 29, batch_id: 1200, cost: 0.001097, acc: 1.000000
-[Wed Oct 10 17:13:05 2018] epoch_id: 29, batch_id: 1300, cost: 0.002556, acc: 1.000000
-[Wed Oct 10 17:13:07 2018] epoch_id: 29, batch_id: 1400, cost: 0.005061, acc: 1.000000
-[Wed Oct 10 17:13:10 2018] epoch_id: 29, batch_id: 1500, cost: 0.002417, acc: 1.000000
-[Wed Oct 10 17:13:12 2018] epoch_id: 29, batch_id: 1600, cost: 0.001037, acc: 1.000000
-[Wed Oct 10 17:13:14 2018] epoch_id: 29, batch_id: 1700, cost: 0.003415, acc: 1.000000
-[Wed Oct 10 17:13:16 2018] epoch_id: 29, batch_id: 1800, cost: 0.033230, acc: 0.984375
-[Wed Oct 10 17:13:19 2018] epoch_id: 29, batch_id: 1900, cost: 0.002914, acc: 1.000000
-[Wed Oct 10 17:13:21 2018] epoch_id: 29, batch_id: 2000, cost: 0.036463, acc: 0.984375
-[Wed Oct 10 17:13:23 2018] epoch_id: 29, batch_id: 2100, cost: 0.067978, acc: 0.976562
-[Wed Oct 10 17:13:25 2018] epoch_id: 29, batch_id: 2200, cost: 0.028088, acc: 0.992188
-[Wed Oct 10 17:13:28 2018] epoch_id: 29, batch_id: 2300, cost: 0.013688, acc: 0.992188
-[Wed Oct 10 17:13:30 2018] epoch_id: 29, batch_id: 2400, cost: 0.000238, acc: 1.000000
-[Wed Oct 10 17:13:32 2018] epoch_id: 29, batch_id: 2500, cost: 0.006287, acc: 1.000000
-[Wed Oct 10 17:13:35 2018] epoch_id: 29, batch_id: 2600, cost: 0.058838, acc: 0.992188
-[Wed Oct 10 17:13:37 2018] epoch_id: 29, batch_id: 2700, cost: 0.013440, acc: 0.992188
-[Wed Oct 10 17:13:39 2018] epoch_id: 29, batch_id: 2800, cost: 0.002577, acc: 1.000000
-[Wed Oct 10 17:13:41 2018] epoch_id: 29, batch_id: 2900, cost: 0.020076, acc: 0.992188
-[Wed Oct 10 17:13:43 2018] epoch_id: 29, batch_id: 3000, cost: 0.025126, acc: 0.992188
-
-[Wed Oct 10 17:13:44 2018] epoch_id: 29, train_avg_cost: 0.017397, train_avg_acc: 0.994107
-[Wed Oct 10 17:13:45 2018] epoch_id: 29, dev_cost: 1.314838, accuracy: 0.8304
-[Wed Oct 10 17:13:46 2018] epoch_id: 29, test_cost: 1.349980, accuracy: 0.8298
-
-[Wed Oct 10 17:13:55 2018] epoch_id: 30, batch_id: 0, cost: 0.063661, acc: 0.984375
-[Wed Oct 10 17:13:57 2018] epoch_id: 30, batch_id: 100, cost: 0.005445, acc: 1.000000
-[Wed Oct 10 17:13:59 2018] epoch_id: 30, batch_id: 200, cost: 0.025451, acc: 0.984375
-[Wed Oct 10 17:14:01 2018] epoch_id: 30, batch_id: 300, cost: 0.019455, acc: 0.992188
-[Wed Oct 10 17:14:04 2018] epoch_id: 30, batch_id: 400, cost: 0.000182, acc: 1.000000
-[Wed Oct 10 17:14:06 2018] epoch_id: 30, batch_id: 500, cost: 0.036089, acc: 0.984375
-[Wed Oct 10 17:14:08 2018] epoch_id: 30, batch_id: 600, cost: 0.003895, acc: 1.000000
-[Wed Oct 10 17:14:10 2018] epoch_id: 30, batch_id: 700, cost: 0.012125, acc: 0.992188
-[Wed Oct 10 17:14:13 2018] epoch_id: 30, batch_id: 800, cost: 0.007463, acc: 1.000000
-[Wed Oct 10 17:14:15 2018] epoch_id: 30, batch_id: 900, cost: 0.043093, acc: 0.992188
-[Wed Oct 10 17:14:17 2018] epoch_id: 30, batch_id: 1000, cost: 0.023025, acc: 0.992188
-[Wed Oct 10 17:14:20 2018] epoch_id: 30, batch_id: 1100, cost: 0.008640, acc: 0.992188
-[Wed Oct 10 17:14:22 2018] epoch_id: 30, batch_id: 1200, cost: 0.023361, acc: 0.984375
-[Wed Oct 10 17:14:24 2018] epoch_id: 30, batch_id: 1300, cost: 0.003226, acc: 1.000000
-[Wed Oct 10 17:14:27 2018] epoch_id: 30, batch_id: 1400, cost: 0.010225, acc: 0.992188
-[Wed Oct 10 17:14:29 2018] epoch_id: 30, batch_id: 1500, cost: 0.009733, acc: 1.000000
-[Wed Oct 10 17:14:31 2018] epoch_id: 30, batch_id: 1600, cost: 0.014048, acc: 0.992188
-[Wed Oct 10 17:14:34 2018] epoch_id: 30, batch_id: 1700, cost: 0.008200, acc: 1.000000
-[Wed Oct 10 17:14:36 2018] epoch_id: 30, batch_id: 1800, cost: 0.035217, acc: 0.992188
-[Wed Oct 10 17:14:38 2018] epoch_id: 30, batch_id: 1900, cost: 0.002707, acc: 1.000000
-[Wed Oct 10 17:14:40 2018] epoch_id: 30, batch_id: 2000, cost: 0.028292, acc: 0.984375
-[Wed Oct 10 17:14:43 2018] epoch_id: 30, batch_id: 2100, cost: 0.003164, acc: 1.000000
-[Wed Oct 10 17:14:45 2018] epoch_id: 30, batch_id: 2200, cost: 0.014421, acc: 0.992188
-[Wed Oct 10 17:14:47 2018] epoch_id: 30, batch_id: 2300, cost: 0.001986, acc: 1.000000
-[Wed Oct 10 17:14:49 2018] epoch_id: 30, batch_id: 2400, cost: 0.038462, acc: 0.992188
-[Wed Oct 10 17:14:52 2018] epoch_id: 30, batch_id: 2500, cost: 0.003580, acc: 1.000000
-[Wed Oct 10 17:14:54 2018] epoch_id: 30, batch_id: 2600, cost: 0.061259, acc: 0.984375
-[Wed Oct 10 17:14:56 2018] epoch_id: 30, batch_id: 2700, cost: 0.042758, acc: 0.992188
-[Wed Oct 10 17:14:59 2018] epoch_id: 30, batch_id: 2800, cost: 0.012991, acc: 0.992188
-[Wed Oct 10 17:15:02 2018] epoch_id: 30, batch_id: 2900, cost: 0.021263, acc: 0.992188
-[Wed Oct 10 17:15:04 2018] epoch_id: 30, batch_id: 3000, cost: 0.046058, acc: 0.992188
-
-[Wed Oct 10 17:15:05 2018] epoch_id: 30, train_avg_cost: 0.016908, train_avg_acc: 0.994391
-[Wed Oct 10 17:15:06 2018] epoch_id: 30, dev_cost: 1.214737, accuracy: 0.8343
-[Wed Oct 10 17:15:07 2018] epoch_id: 30, test_cost: 1.247275, accuracy: 0.828
-
-[Wed Oct 10 17:15:15 2018] epoch_id: 31, batch_id: 0, cost: 0.019613, acc: 0.992188
-[Wed Oct 10 17:15:17 2018] epoch_id: 31, batch_id: 100, cost: 0.048000, acc: 0.984375
-[Wed Oct 10 17:15:19 2018] epoch_id: 31, batch_id: 200, cost: 0.038604, acc: 0.992188
-[Wed Oct 10 17:15:21 2018] epoch_id: 31, batch_id: 300, cost: 0.003548, acc: 1.000000
-[Wed Oct 10 17:15:24 2018] epoch_id: 31, batch_id: 400, cost: 0.001539, acc: 1.000000
-[Wed Oct 10 17:15:26 2018] epoch_id: 31, batch_id: 500, cost: 0.034219, acc: 0.992188
-[Wed Oct 10 17:15:28 2018] epoch_id: 31, batch_id: 600, cost: 0.005696, acc: 1.000000
-[Wed Oct 10 17:15:31 2018] epoch_id: 31, batch_id: 700, cost: 0.012590, acc: 0.992188
-[Wed Oct 10 17:15:33 2018] epoch_id: 31, batch_id: 800, cost: 0.010021, acc: 0.992188
-[Wed Oct 10 17:15:35 2018] epoch_id: 31, batch_id: 900, cost: 0.004838, acc: 1.000000
-[Wed Oct 10 17:15:38 2018] epoch_id: 31, batch_id: 1000, cost: 0.006327, acc: 1.000000
-[Wed Oct 10 17:15:40 2018] epoch_id: 31, batch_id: 1100, cost: 0.019881, acc: 0.992188
-[Wed Oct 10 17:15:42 2018] epoch_id: 31, batch_id: 1200, cost: 0.006641, acc: 1.000000
-[Wed Oct 10 17:15:44 2018] epoch_id: 31, batch_id: 1300, cost: 0.014323, acc: 0.992188
-[Wed Oct 10 17:15:47 2018] epoch_id: 31, batch_id: 1400, cost: 0.008565, acc: 1.000000
-[Wed Oct 10 17:15:49 2018] epoch_id: 31, batch_id: 1500, cost: 0.003106, acc: 1.000000
-[Wed Oct 10 17:15:51 2018] epoch_id: 31, batch_id: 1600, cost: 0.023656, acc: 0.992188
-[Wed Oct 10 17:15:53 2018] epoch_id: 31, batch_id: 1700, cost: 0.014398, acc: 1.000000
-[Wed Oct 10 17:15:56 2018] epoch_id: 31, batch_id: 1800, cost: 0.005019, acc: 1.000000
-[Wed Oct 10 17:15:58 2018] epoch_id: 31, batch_id: 1900, cost: 0.042051, acc: 0.984375
-[Wed Oct 10 17:16:00 2018] epoch_id: 31, batch_id: 2000, cost: 0.005070, acc: 1.000000
-[Wed Oct 10 17:16:03 2018] epoch_id: 31, batch_id: 2100, cost: 0.071147, acc: 0.984375
-[Wed Oct 10 17:16:05 2018] epoch_id: 31, batch_id: 2200, cost: 0.004077, acc: 1.000000
-[Wed Oct 10 17:16:07 2018] epoch_id: 31, batch_id: 2300, cost: 0.000753, acc: 1.000000
-[Wed Oct 10 17:16:11 2018] epoch_id: 31, batch_id: 2400, cost: 0.007293, acc: 1.000000
-[Wed Oct 10 17:16:13 2018] epoch_id: 31, batch_id: 2500, cost: 0.020403, acc: 0.992188
-[Wed Oct 10 17:16:15 2018] epoch_id: 31, batch_id: 2600, cost: 0.002491, acc: 1.000000
-[Wed Oct 10 17:16:17 2018] epoch_id: 31, batch_id: 2700, cost: 0.001376, acc: 1.000000
-[Wed Oct 10 17:16:20 2018] epoch_id: 31, batch_id: 2800, cost: 0.006589, acc: 1.000000
-[Wed Oct 10 17:16:22 2018] epoch_id: 31, batch_id: 2900, cost: 0.009986, acc: 1.000000
-[Wed Oct 10 17:16:24 2018] epoch_id: 31, batch_id: 3000, cost: 0.004628, acc: 1.000000
-
-[Wed Oct 10 17:16:25 2018] epoch_id: 31, train_avg_cost: 0.016863, train_avg_acc: 0.994502
-[Wed Oct 10 17:16:26 2018] epoch_id: 31, dev_cost: 1.237226, accuracy: 0.8348
-[Wed Oct 10 17:16:27 2018] epoch_id: 31, test_cost: 1.256692, accuracy: 0.8327
-
-[Wed Oct 10 17:16:35 2018] epoch_id: 32, batch_id: 0, cost: 0.001936, acc: 1.000000
-[Wed Oct 10 17:16:37 2018] epoch_id: 32, batch_id: 100, cost: 0.002628, acc: 1.000000
-[Wed Oct 10 17:16:40 2018] epoch_id: 32, batch_id: 200, cost: 0.006948, acc: 1.000000
-[Wed Oct 10 17:16:42 2018] epoch_id: 32, batch_id: 300, cost: 0.001289, acc: 1.000000
-[Wed Oct 10 17:16:44 2018] epoch_id: 32, batch_id: 400, cost: 0.016850, acc: 1.000000
-[Wed Oct 10 17:16:46 2018] epoch_id: 32, batch_id: 500, cost: 0.001709, acc: 1.000000
-[Wed Oct 10 17:16:49 2018] epoch_id: 32, batch_id: 600, cost: 0.000500, acc: 1.000000
-[Wed Oct 10 17:16:51 2018] epoch_id: 32, batch_id: 700, cost: 0.026876, acc: 0.992188
-[Wed Oct 10 17:16:54 2018] epoch_id: 32, batch_id: 800, cost: 0.032499, acc: 0.992188
-[Wed Oct 10 17:16:56 2018] epoch_id: 32, batch_id: 900, cost: 0.008563, acc: 1.000000
-[Wed Oct 10 17:16:59 2018] epoch_id: 32, batch_id: 1000, cost: 0.033638, acc: 0.992188
-[Wed Oct 10 17:17:01 2018] epoch_id: 32, batch_id: 1100, cost: 0.021626, acc: 0.992188
-[Wed Oct 10 17:17:03 2018] epoch_id: 32, batch_id: 1200, cost: 0.035490, acc: 0.984375
-[Wed Oct 10 17:17:05 2018] epoch_id: 32, batch_id: 1300, cost: 0.064303, acc: 0.992188
-[Wed Oct 10 17:17:08 2018] epoch_id: 32, batch_id: 1400, cost: 0.000839, acc: 1.000000
-[Wed Oct 10 17:17:10 2018] epoch_id: 32, batch_id: 1500, cost: 0.014770, acc: 0.992188
-[Wed Oct 10 17:17:12 2018] epoch_id: 32, batch_id: 1600, cost: 0.067803, acc: 0.992188
-[Wed Oct 10 17:17:14 2018] epoch_id: 32, batch_id: 1700, cost: 0.001507, acc: 1.000000
-[Wed Oct 10 17:17:17 2018] epoch_id: 32, batch_id: 1800, cost: 0.039594, acc: 0.984375
-[Wed Oct 10 17:17:19 2018] epoch_id: 32, batch_id: 1900, cost: 0.016198, acc: 0.992188
-[Wed Oct 10 17:17:21 2018] epoch_id: 32, batch_id: 2000, cost: 0.027783, acc: 0.984375
-[Wed Oct 10 17:17:24 2018] epoch_id: 32, batch_id: 2100, cost: 0.010040, acc: 0.992188
-[Wed Oct 10 17:17:26 2018] epoch_id: 32, batch_id: 2200, cost: 0.043833, acc: 0.992188
-[Wed Oct 10 17:17:28 2018] epoch_id: 32, batch_id: 2300, cost: 0.012850, acc: 0.992188
-[Wed Oct 10 17:17:31 2018] epoch_id: 32, batch_id: 2400, cost: 0.010643, acc: 1.000000
-[Wed Oct 10 17:17:33 2018] epoch_id: 32, batch_id: 2500, cost: 0.013513, acc: 0.992188
-[Wed Oct 10 17:17:35 2018] epoch_id: 32, batch_id: 2600, cost: 0.021498, acc: 0.984375
-[Wed Oct 10 17:17:38 2018] epoch_id: 32, batch_id: 2700, cost: 0.048091, acc: 0.984375
-[Wed Oct 10 17:17:40 2018] epoch_id: 32, batch_id: 2800, cost: 0.054710, acc: 0.984375
-[Wed Oct 10 17:17:42 2018] epoch_id: 32, batch_id: 2900, cost: 0.028200, acc: 0.992188
-[Wed Oct 10 17:17:44 2018] epoch_id: 32, batch_id: 3000, cost: 0.052160, acc: 0.992188
-
-[Wed Oct 10 17:17:45 2018] epoch_id: 32, train_avg_cost: 0.016115, train_avg_acc: 0.994599
-[Wed Oct 10 17:17:46 2018] epoch_id: 32, dev_cost: 1.182178, accuracy: 0.8359
-[Wed Oct 10 17:17:47 2018] epoch_id: 32, test_cost: 1.183695, accuracy: 0.8297
-
-[Wed Oct 10 17:17:55 2018] epoch_id: 33, batch_id: 0, cost: 0.002170, acc: 1.000000
-[Wed Oct 10 17:17:58 2018] epoch_id: 33, batch_id: 100, cost: 0.000724, acc: 1.000000
-[Wed Oct 10 17:18:00 2018] epoch_id: 33, batch_id: 200, cost: 0.102036, acc: 0.968750
-[Wed Oct 10 17:18:02 2018] epoch_id: 33, batch_id: 300, cost: 0.006967, acc: 1.000000
-[Wed Oct 10 17:18:04 2018] epoch_id: 33, batch_id: 400, cost: 0.004401, acc: 1.000000
-[Wed Oct 10 17:18:07 2018] epoch_id: 33, batch_id: 500, cost: 0.006693, acc: 1.000000
-[Wed Oct 10 17:18:09 2018] epoch_id: 33, batch_id: 600, cost: 0.002759, acc: 1.000000
-[Wed Oct 10 17:18:11 2018] epoch_id: 33, batch_id: 700, cost: 0.000587, acc: 1.000000
-[Wed Oct 10 17:18:13 2018] epoch_id: 33, batch_id: 800, cost: 0.006432, acc: 1.000000
-[Wed Oct 10 17:18:16 2018] epoch_id: 33, batch_id: 900, cost: 0.043751, acc: 0.984375
-[Wed Oct 10 17:18:18 2018] epoch_id: 33, batch_id: 1000, cost: 0.006652, acc: 1.000000
-[Wed Oct 10 17:18:20 2018] epoch_id: 33, batch_id: 1100, cost: 0.008419, acc: 1.000000
-[Wed Oct 10 17:18:23 2018] epoch_id: 33, batch_id: 1200, cost: 0.012309, acc: 0.992188
-[Wed Oct 10 17:18:25 2018] epoch_id: 33, batch_id: 1300, cost: 0.023884, acc: 0.984375
-[Wed Oct 10 17:18:27 2018] epoch_id: 33, batch_id: 1400, cost: 0.011711, acc: 0.992188
-[Wed Oct 10 17:18:29 2018] epoch_id: 33, batch_id: 1500, cost: 0.005948, acc: 1.000000
-[Wed Oct 10 17:18:32 2018] epoch_id: 33, batch_id: 1600, cost: 0.014363, acc: 0.992188
-[Wed Oct 10 17:18:34 2018] epoch_id: 33, batch_id: 1700, cost: 0.000291, acc: 1.000000
-[Wed Oct 10 17:18:37 2018] epoch_id: 33, batch_id: 1800, cost: 0.005694, acc: 1.000000
-[Wed Oct 10 17:18:40 2018] epoch_id: 33, batch_id: 1900, cost: 0.170195, acc: 0.984375
-[Wed Oct 10 17:18:42 2018] epoch_id: 33, batch_id: 2000, cost: 0.001044, acc: 1.000000
-[Wed Oct 10 17:18:44 2018] epoch_id: 33, batch_id: 2100, cost: 0.004921, acc: 1.000000
-[Wed Oct 10 17:18:46 2018] epoch_id: 33, batch_id: 2200, cost: 0.006203, acc: 1.000000
-[Wed Oct 10 17:18:48 2018] epoch_id: 33, batch_id: 2300, cost: 0.038624, acc: 0.984375
-[Wed Oct 10 17:18:51 2018] epoch_id: 33, batch_id: 2400, cost: 0.067313, acc: 0.976562
-[Wed Oct 10 17:18:53 2018] epoch_id: 33, batch_id: 2500, cost: 0.040853, acc: 0.992188
-[Wed Oct 10 17:18:55 2018] epoch_id: 33, batch_id: 2600, cost: 0.039087, acc: 0.984375
-[Wed Oct 10 17:18:57 2018] epoch_id: 33, batch_id: 2700, cost: 0.004672, acc: 1.000000
-[Wed Oct 10 17:19:00 2018] epoch_id: 33, batch_id: 2800, cost: 0.021997, acc: 0.984375
-[Wed Oct 10 17:19:02 2018] epoch_id: 33, batch_id: 2900, cost: 0.013635, acc: 1.000000
-[Wed Oct 10 17:19:04 2018] epoch_id: 33, batch_id: 3000, cost: 0.009055, acc: 0.992188
-
-[Wed Oct 10 17:19:05 2018] epoch_id: 33, train_avg_cost: 0.014972, train_avg_acc: 0.995145
-[Wed Oct 10 17:19:06 2018] epoch_id: 33, dev_cost: 1.819085, accuracy: 0.8352
-[Wed Oct 10 17:19:07 2018] epoch_id: 33, test_cost: 1.859041, accuracy: 0.8314
-
-[Wed Oct 10 17:19:15 2018] epoch_id: 34, batch_id: 0, cost: 0.026821, acc: 0.992188
-[Wed Oct 10 17:19:17 2018] epoch_id: 34, batch_id: 100, cost: 0.001463, acc: 1.000000
-[Wed Oct 10 17:19:20 2018] epoch_id: 34, batch_id: 200, cost: 0.000579, acc: 1.000000
-[Wed Oct 10 17:19:22 2018] epoch_id: 34, batch_id: 300, cost: 0.000492, acc: 1.000000
-[Wed Oct 10 17:19:24 2018] epoch_id: 34, batch_id: 400, cost: 0.000671, acc: 1.000000
-[Wed Oct 10 17:19:26 2018] epoch_id: 34, batch_id: 500, cost: 0.007763, acc: 1.000000
-[Wed Oct 10 17:19:29 2018] epoch_id: 34, batch_id: 600, cost: 0.018827, acc: 0.992188
-[Wed Oct 10 17:19:31 2018] epoch_id: 34, batch_id: 700, cost: 0.004606, acc: 1.000000
-[Wed Oct 10 17:19:33 2018] epoch_id: 34, batch_id: 800, cost: 0.004697, acc: 1.000000
-[Wed Oct 10 17:19:35 2018] epoch_id: 34, batch_id: 900, cost: 0.003752, acc: 1.000000
-[Wed Oct 10 17:19:38 2018] epoch_id: 34, batch_id: 1000, cost: 0.003546, acc: 1.000000
-[Wed Oct 10 17:19:40 2018] epoch_id: 34, batch_id: 1100, cost: 0.003848, acc: 1.000000
-[Wed Oct 10 17:19:42 2018] epoch_id: 34, batch_id: 1200, cost: 0.010363, acc: 1.000000
-[Wed Oct 10 17:19:44 2018] epoch_id: 34, batch_id: 1300, cost: 0.013875, acc: 0.992188
-[Wed Oct 10 17:19:47 2018] epoch_id: 34, batch_id: 1400, cost: 0.009212, acc: 0.992188
-[Wed Oct 10 17:19:49 2018] epoch_id: 34, batch_id: 1500, cost: 0.047909, acc: 0.992188
-[Wed Oct 10 17:19:51 2018] epoch_id: 34, batch_id: 1600, cost: 0.012809, acc: 0.992188
-[Wed Oct 10 17:19:53 2018] epoch_id: 34, batch_id: 1700, cost: 0.009717, acc: 1.000000
-[Wed Oct 10 17:19:56 2018] epoch_id: 34, batch_id: 1800, cost: 0.026330, acc: 0.984375
-[Wed Oct 10 17:19:58 2018] epoch_id: 34, batch_id: 1900, cost: 0.016982, acc: 0.992188
-[Wed Oct 10 17:20:00 2018] epoch_id: 34, batch_id: 2000, cost: 0.021416, acc: 0.992188
-[Wed Oct 10 17:20:03 2018] epoch_id: 34, batch_id: 2100, cost: 0.001120, acc: 1.000000
-[Wed Oct 10 17:20:05 2018] epoch_id: 34, batch_id: 2200, cost: 0.011436, acc: 1.000000
-[Wed Oct 10 17:20:07 2018] epoch_id: 34, batch_id: 2300, cost: 0.007605, acc: 0.992188
-[Wed Oct 10 17:20:10 2018] epoch_id: 34, batch_id: 2400, cost: 0.026308, acc: 0.992188
-[Wed Oct 10 17:20:12 2018] epoch_id: 34, batch_id: 2500, cost: 0.006798, acc: 1.000000
-[Wed Oct 10 17:20:14 2018] epoch_id: 34, batch_id: 2600, cost: 0.017334, acc: 0.992188
-[Wed Oct 10 17:20:16 2018] epoch_id: 34, batch_id: 2700, cost: 0.030094, acc: 0.992188
-[Wed Oct 10 17:20:18 2018] epoch_id: 34, batch_id: 2800, cost: 0.053259, acc: 0.992188
-[Wed Oct 10 17:20:21 2018] epoch_id: 34, batch_id: 2900, cost: 0.061547, acc: 0.968750
-[Wed Oct 10 17:20:23 2018] epoch_id: 34, batch_id: 3000, cost: 0.002864, acc: 1.000000
-
-[Wed Oct 10 17:20:24 2018] epoch_id: 34, train_avg_cost: 0.014813, train_avg_acc: 0.995064
-[Wed Oct 10 17:20:25 2018] epoch_id: 34, dev_cost: 1.697732, accuracy: 0.8346
-[Wed Oct 10 17:20:26 2018] epoch_id: 34, test_cost: 1.721137, accuracy: 0.8341
-
-[Wed Oct 10 17:20:34 2018] epoch_id: 35, batch_id: 0, cost: 0.000268, acc: 1.000000
-[Wed Oct 10 17:20:37 2018] epoch_id: 35, batch_id: 100, cost: 0.001389, acc: 1.000000
-[Wed Oct 10 17:20:39 2018] epoch_id: 35, batch_id: 200, cost: 0.003275, acc: 1.000000
-[Wed Oct 10 17:20:41 2018] epoch_id: 35, batch_id: 300, cost: 0.006535, acc: 1.000000
-[Wed Oct 10 17:20:43 2018] epoch_id: 35, batch_id: 400, cost: 0.005316, acc: 1.000000
-[Wed Oct 10 17:20:45 2018] epoch_id: 35, batch_id: 500, cost: 0.017976, acc: 0.992188
-[Wed Oct 10 17:20:48 2018] epoch_id: 35, batch_id: 600, cost: 0.060320, acc: 0.992188
-[Wed Oct 10 17:20:50 2018] epoch_id: 35, batch_id: 700, cost: 0.004358, acc: 1.000000
-[Wed Oct 10 17:20:52 2018] epoch_id: 35, batch_id: 800, cost: 0.003560, acc: 1.000000
-[Wed Oct 10 17:20:55 2018] epoch_id: 35, batch_id: 900, cost: 0.017978, acc: 0.992188
-[Wed Oct 10 17:20:57 2018] epoch_id: 35, batch_id: 1000, cost: 0.007025, acc: 1.000000
-[Wed Oct 10 17:20:59 2018] epoch_id: 35, batch_id: 1100, cost: 0.008777, acc: 0.992188
-[Wed Oct 10 17:21:01 2018] epoch_id: 35, batch_id: 1200, cost: 0.006591, acc: 1.000000
-[Wed Oct 10 17:21:04 2018] epoch_id: 35, batch_id: 1300, cost: 0.008911, acc: 0.992188
-[Wed Oct 10 17:21:06 2018] epoch_id: 35, batch_id: 1400, cost: 0.038343, acc: 0.984375
-[Wed Oct 10 17:21:08 2018] epoch_id: 35, batch_id: 1500, cost: 0.001654, acc: 1.000000
-[Wed Oct 10 17:21:10 2018] epoch_id: 35, batch_id: 1600, cost: 0.002577, acc: 1.000000
-[Wed Oct 10 17:21:13 2018] epoch_id: 35, batch_id: 1700, cost: 0.026908, acc: 0.992188
-[Wed Oct 10 17:21:15 2018] epoch_id: 35, batch_id: 1800, cost: 0.024004, acc: 0.992188
-[Wed Oct 10 17:21:17 2018] epoch_id: 35, batch_id: 1900, cost: 0.013134, acc: 0.992188
-[Wed Oct 10 17:21:19 2018] epoch_id: 35, batch_id: 2000, cost: 0.003633, acc: 1.000000
-[Wed Oct 10 17:21:21 2018] epoch_id: 35, batch_id: 2100, cost: 0.011727, acc: 0.992188
-[Wed Oct 10 17:21:24 2018] epoch_id: 35, batch_id: 2200, cost: 0.019991, acc: 0.992188
-[Wed Oct 10 17:21:26 2018] epoch_id: 35, batch_id: 2300, cost: 0.004771, acc: 1.000000
-[Wed Oct 10 17:21:28 2018] epoch_id: 35, batch_id: 2400, cost: 0.013732, acc: 0.992188
-[Wed Oct 10 17:21:30 2018] epoch_id: 35, batch_id: 2500, cost: 0.096741, acc: 0.984375
-[Wed Oct 10 17:21:33 2018] epoch_id: 35, batch_id: 2600, cost: 0.006102, acc: 1.000000
-[Wed Oct 10 17:21:36 2018] epoch_id: 35, batch_id: 2700, cost: 0.007046, acc: 0.992188
-[Wed Oct 10 17:21:38 2018] epoch_id: 35, batch_id: 2800, cost: 0.028777, acc: 0.984375
-[Wed Oct 10 17:21:41 2018] epoch_id: 35, batch_id: 2900, cost: 0.116960, acc: 0.976562
-[Wed Oct 10 17:21:43 2018] epoch_id: 35, batch_id: 3000, cost: 0.039752, acc: 0.968750
-
-[Wed Oct 10 17:21:44 2018] epoch_id: 35, train_avg_cost: 0.014921, train_avg_acc: 0.995075
-[Wed Oct 10 17:21:45 2018] epoch_id: 35, dev_cost: 1.203598, accuracy: 0.8348
-[Wed Oct 10 17:21:45 2018] epoch_id: 35, test_cost: 1.205202, accuracy: 0.8347
-
-[Wed Oct 10 17:21:54 2018] epoch_id: 36, batch_id: 0, cost: 0.009331, acc: 1.000000
-[Wed Oct 10 17:21:56 2018] epoch_id: 36, batch_id: 100, cost: 0.004473, acc: 1.000000
-[Wed Oct 10 17:21:58 2018] epoch_id: 36, batch_id: 200, cost: 0.001097, acc: 1.000000
-[Wed Oct 10 17:22:00 2018] epoch_id: 36, batch_id: 300, cost: 0.001914, acc: 1.000000
-[Wed Oct 10 17:22:03 2018] epoch_id: 36, batch_id: 400, cost: 0.003967, acc: 1.000000
-[Wed Oct 10 17:22:05 2018] epoch_id: 36, batch_id: 500, cost: 0.008101, acc: 1.000000
-[Wed Oct 10 17:22:07 2018] epoch_id: 36, batch_id: 600, cost: 0.037581, acc: 0.976562
-[Wed Oct 10 17:22:09 2018] epoch_id: 36, batch_id: 700, cost: 0.031872, acc: 0.992188
-[Wed Oct 10 17:22:11 2018] epoch_id: 36, batch_id: 800, cost: 0.002586, acc: 1.000000
-[Wed Oct 10 17:22:14 2018] epoch_id: 36, batch_id: 900, cost: 0.025838, acc: 0.984375
-[Wed Oct 10 17:22:16 2018] epoch_id: 36, batch_id: 1000, cost: 0.012382, acc: 0.992188
-[Wed Oct 10 17:22:18 2018] epoch_id: 36, batch_id: 1100, cost: 0.006482, acc: 1.000000
-[Wed Oct 10 17:22:20 2018] epoch_id: 36, batch_id: 1200, cost: 0.006437, acc: 1.000000
-[Wed Oct 10 17:22:23 2018] epoch_id: 36, batch_id: 1300, cost: 0.026039, acc: 0.992188
-[Wed Oct 10 17:22:25 2018] epoch_id: 36, batch_id: 1400, cost: 0.017908, acc: 0.992188
-[Wed Oct 10 17:22:27 2018] epoch_id: 36, batch_id: 1500, cost: 0.025722, acc: 0.984375
-[Wed Oct 10 17:22:29 2018] epoch_id: 36, batch_id: 1600, cost: 0.031398, acc: 0.992188
-[Wed Oct 10 17:22:32 2018] epoch_id: 36, batch_id: 1700, cost: 0.034194, acc: 0.984375
-[Wed Oct 10 17:22:34 2018] epoch_id: 36, batch_id: 1800, cost: 0.001353, acc: 1.000000
-[Wed Oct 10 17:22:36 2018] epoch_id: 36, batch_id: 1900, cost: 0.000942, acc: 1.000000
-[Wed Oct 10 17:22:38 2018] epoch_id: 36, batch_id: 2000, cost: 0.004051, acc: 1.000000
-[Wed Oct 10 17:22:40 2018] epoch_id: 36, batch_id: 2100, cost: 0.016359, acc: 0.992188
-[Wed Oct 10 17:22:43 2018] epoch_id: 36, batch_id: 2200, cost: 0.010324, acc: 1.000000
-[Wed Oct 10 17:22:46 2018] epoch_id: 36, batch_id: 2300, cost: 0.015250, acc: 1.000000
-[Wed Oct 10 17:22:48 2018] epoch_id: 36, batch_id: 2400, cost: 0.053711, acc: 0.976562
-[Wed Oct 10 17:22:51 2018] epoch_id: 36, batch_id: 2500, cost: 0.059409, acc: 0.984375
-[Wed Oct 10 17:22:53 2018] epoch_id: 36, batch_id: 2600, cost: 0.009707, acc: 1.000000
-[Wed Oct 10 17:22:55 2018] epoch_id: 36, batch_id: 2700, cost: 0.003367, acc: 1.000000
-[Wed Oct 10 17:22:58 2018] epoch_id: 36, batch_id: 2800, cost: 0.001207, acc: 1.000000
-[Wed Oct 10 17:23:00 2018] epoch_id: 36, batch_id: 2900, cost: 0.009538, acc: 0.992188
-[Wed Oct 10 17:23:02 2018] epoch_id: 36, batch_id: 3000, cost: 0.013745, acc: 0.992188
-
-[Wed Oct 10 17:23:03 2018] epoch_id: 36, train_avg_cost: 0.014009, train_avg_acc: 0.995522
-[Wed Oct 10 17:23:04 2018] epoch_id: 36, dev_cost: 1.647745, accuracy: 0.8324
-[Wed Oct 10 17:23:05 2018] epoch_id: 36, test_cost: 1.662931, accuracy: 0.8368
-
-[Wed Oct 10 17:23:13 2018] epoch_id: 37, batch_id: 0, cost: 0.009128, acc: 1.000000
-[Wed Oct 10 17:23:15 2018] epoch_id: 37, batch_id: 100, cost: 0.000989, acc: 1.000000
-[Wed Oct 10 17:23:17 2018] epoch_id: 37, batch_id: 200, cost: 0.031867, acc: 0.992188
-[Wed Oct 10 17:23:20 2018] epoch_id: 37, batch_id: 300, cost: 0.016197, acc: 0.984375
-[Wed Oct 10 17:23:22 2018] epoch_id: 37, batch_id: 400, cost: 0.004157, acc: 1.000000
-[Wed Oct 10 17:23:24 2018] epoch_id: 37, batch_id: 500, cost: 0.004215, acc: 1.000000
-[Wed Oct 10 17:23:26 2018] epoch_id: 37, batch_id: 600, cost: 0.000303, acc: 1.000000
-[Wed Oct 10 17:23:29 2018] epoch_id: 37, batch_id: 700, cost: 0.005056, acc: 1.000000
-[Wed Oct 10 17:23:31 2018] epoch_id: 37, batch_id: 800, cost: 0.016816, acc: 0.992188
-[Wed Oct 10 17:23:34 2018] epoch_id: 37, batch_id: 900, cost: 0.036067, acc: 0.984375
-[Wed Oct 10 17:23:37 2018] epoch_id: 37, batch_id: 1000, cost: 0.002430, acc: 1.000000
-[Wed Oct 10 17:23:39 2018] epoch_id: 37, batch_id: 1100, cost: 0.001621, acc: 1.000000
-[Wed Oct 10 17:23:41 2018] epoch_id: 37, batch_id: 1200, cost: 0.034505, acc: 0.992188
-[Wed Oct 10 17:23:43 2018] epoch_id: 37, batch_id: 1300, cost: 0.008605, acc: 0.992188
-[Wed Oct 10 17:23:45 2018] epoch_id: 37, batch_id: 1400, cost: 0.039387, acc: 0.984375
-[Wed Oct 10 17:23:48 2018] epoch_id: 37, batch_id: 1500, cost: 0.005761, acc: 1.000000
-[Wed Oct 10 17:23:50 2018] epoch_id: 37, batch_id: 1600, cost: 0.002905, acc: 1.000000
-[Wed Oct 10 17:23:52 2018] epoch_id: 37, batch_id: 1700, cost: 0.009640, acc: 1.000000
-[Wed Oct 10 17:23:55 2018] epoch_id: 37, batch_id: 1800, cost: 0.004734, acc: 1.000000
-[Wed Oct 10 17:23:57 2018] epoch_id: 37, batch_id: 1900, cost: 0.029191, acc: 0.992188
-[Wed Oct 10 17:23:59 2018] epoch_id: 37, batch_id: 2000, cost: 0.000724, acc: 1.000000
-[Wed Oct 10 17:24:01 2018] epoch_id: 37, batch_id: 2100, cost: 0.014325, acc: 0.992188
-[Wed Oct 10 17:24:04 2018] epoch_id: 37, batch_id: 2200, cost: 0.004239, acc: 1.000000
-[Wed Oct 10 17:24:06 2018] epoch_id: 37, batch_id: 2300, cost: 0.000597, acc: 1.000000
-[Wed Oct 10 17:24:08 2018] epoch_id: 37, batch_id: 2400, cost: 0.008226, acc: 1.000000
-[Wed Oct 10 17:24:10 2018] epoch_id: 37, batch_id: 2500, cost: 0.001601, acc: 1.000000
-[Wed Oct 10 17:24:12 2018] epoch_id: 37, batch_id: 2600, cost: 0.014527, acc: 0.992188
-[Wed Oct 10 17:24:15 2018] epoch_id: 37, batch_id: 2700, cost: 0.010813, acc: 0.992188
-[Wed Oct 10 17:24:17 2018] epoch_id: 37, batch_id: 2800, cost: 0.015832, acc: 0.992188
-[Wed Oct 10 17:24:19 2018] epoch_id: 37, batch_id: 2900, cost: 0.063636, acc: 0.976562
-[Wed Oct 10 17:24:22 2018] epoch_id: 37, batch_id: 3000, cost: 0.003993, acc: 1.000000
-
-[Wed Oct 10 17:24:22 2018] epoch_id: 37, train_avg_cost: 0.014056, train_avg_acc: 0.995431
-[Wed Oct 10 17:24:23 2018] epoch_id: 37, dev_cost: 1.500988, accuracy: 0.8334
-[Wed Oct 10 17:24:24 2018] epoch_id: 37, test_cost: 1.491400, accuracy: 0.8327
-
-[Wed Oct 10 17:24:33 2018] epoch_id: 38, batch_id: 0, cost: 0.016895, acc: 0.992188
-[Wed Oct 10 17:24:35 2018] epoch_id: 38, batch_id: 100, cost: 0.001690, acc: 1.000000
-[Wed Oct 10 17:24:38 2018] epoch_id: 38, batch_id: 200, cost: 0.009989, acc: 1.000000
-[Wed Oct 10 17:24:40 2018] epoch_id: 38, batch_id: 300, cost: 0.023480, acc: 0.984375
-[Wed Oct 10 17:24:42 2018] epoch_id: 38, batch_id: 400, cost: 0.004687, acc: 1.000000
-[Wed Oct 10 17:24:45 2018] epoch_id: 38, batch_id: 500, cost: 0.020183, acc: 0.992188
-[Wed Oct 10 17:24:47 2018] epoch_id: 38, batch_id: 600, cost: 0.028614, acc: 0.992188
-[Wed Oct 10 17:24:49 2018] epoch_id: 38, batch_id: 700, cost: 0.000448, acc: 1.000000
-[Wed Oct 10 17:24:51 2018] epoch_id: 38, batch_id: 800, cost: 0.000913, acc: 1.000000
-[Wed Oct 10 17:24:54 2018] epoch_id: 38, batch_id: 900, cost: 0.022090, acc: 0.992188
-[Wed Oct 10 17:24:56 2018] epoch_id: 38, batch_id: 1000, cost: 0.006918, acc: 0.992188
-[Wed Oct 10 17:24:58 2018] epoch_id: 38, batch_id: 1100, cost: 0.028611, acc: 0.984375
-[Wed Oct 10 17:25:00 2018] epoch_id: 38, batch_id: 1200, cost: 0.013097, acc: 0.992188
-[Wed Oct 10 17:25:03 2018] epoch_id: 38, batch_id: 1300, cost: 0.014227, acc: 0.992188
-[Wed Oct 10 17:25:05 2018] epoch_id: 38, batch_id: 1400, cost: 0.033064, acc: 0.992188
-[Wed Oct 10 17:25:07 2018] epoch_id: 38, batch_id: 1500, cost: 0.004276, acc: 1.000000
-[Wed Oct 10 17:25:09 2018] epoch_id: 38, batch_id: 1600, cost: 0.016516, acc: 0.992188
-[Wed Oct 10 17:25:12 2018] epoch_id: 38, batch_id: 1700, cost: 0.004443, acc: 1.000000
-[Wed Oct 10 17:25:14 2018] epoch_id: 38, batch_id: 1800, cost: 0.001648, acc: 1.000000
-[Wed Oct 10 17:25:17 2018] epoch_id: 38, batch_id: 1900, cost: 0.026780, acc: 0.992188
-[Wed Oct 10 17:25:20 2018] epoch_id: 38, batch_id: 2000, cost: 0.006375, acc: 0.992188
-[Wed Oct 10 17:25:22 2018] epoch_id: 38, batch_id: 2100, cost: 0.013131, acc: 0.992188
-[Wed Oct 10 17:25:24 2018] epoch_id: 38, batch_id: 2200, cost: 0.012666, acc: 1.000000
-[Wed Oct 10 17:25:26 2018] epoch_id: 38, batch_id: 2300, cost: 0.001973, acc: 1.000000
-[Wed Oct 10 17:25:29 2018] epoch_id: 38, batch_id: 2400, cost: 0.005966, acc: 1.000000
-[Wed Oct 10 17:25:31 2018] epoch_id: 38, batch_id: 2500, cost: 0.011249, acc: 0.992188
-[Wed Oct 10 17:25:33 2018] epoch_id: 38, batch_id: 2600, cost: 0.022209, acc: 0.992188
-[Wed Oct 10 17:25:36 2018] epoch_id: 38, batch_id: 2700, cost: 0.003999, acc: 1.000000
-[Wed Oct 10 17:25:38 2018] epoch_id: 38, batch_id: 2800, cost: 0.010264, acc: 0.992188
-[Wed Oct 10 17:25:40 2018] epoch_id: 38, batch_id: 2900, cost: 0.003841, acc: 1.000000
-[Wed Oct 10 17:25:42 2018] epoch_id: 38, batch_id: 3000, cost: 0.075514, acc: 0.992188
-
-[Wed Oct 10 17:25:43 2018] epoch_id: 38, train_avg_cost: 0.013573, train_avg_acc: 0.995548
-[Wed Oct 10 17:25:44 2018] epoch_id: 38, dev_cost: 1.577028, accuracy: 0.8317
-[Wed Oct 10 17:25:45 2018] epoch_id: 38, test_cost: 1.546861, accuracy: 0.8363
-
-[Wed Oct 10 17:25:54 2018] epoch_id: 39, batch_id: 0, cost: 0.000487, acc: 1.000000
-[Wed Oct 10 17:25:56 2018] epoch_id: 39, batch_id: 100, cost: 0.003988, acc: 1.000000
-[Wed Oct 10 17:25:58 2018] epoch_id: 39, batch_id: 200, cost: 0.069709, acc: 0.984375
-[Wed Oct 10 17:26:00 2018] epoch_id: 39, batch_id: 300, cost: 0.031796, acc: 0.992188
-[Wed Oct 10 17:26:03 2018] epoch_id: 39, batch_id: 400, cost: 0.007788, acc: 1.000000
-[Wed Oct 10 17:26:05 2018] epoch_id: 39, batch_id: 500, cost: 0.014854, acc: 0.992188
-[Wed Oct 10 17:26:07 2018] epoch_id: 39, batch_id: 600, cost: 0.017382, acc: 0.992188
-[Wed Oct 10 17:26:09 2018] epoch_id: 39, batch_id: 700, cost: 0.003342, acc: 1.000000
-[Wed Oct 10 17:26:12 2018] epoch_id: 39, batch_id: 800, cost: 0.003279, acc: 1.000000
-[Wed Oct 10 17:26:14 2018] epoch_id: 39, batch_id: 900, cost: 0.018283, acc: 0.992188
-[Wed Oct 10 17:26:16 2018] epoch_id: 39, batch_id: 1000, cost: 0.000697, acc: 1.000000
-[Wed Oct 10 17:26:18 2018] epoch_id: 39, batch_id: 1100, cost: 0.003188, acc: 1.000000
-[Wed Oct 10 17:26:21 2018] epoch_id: 39, batch_id: 1200, cost: 0.002884, acc: 1.000000
-[Wed Oct 10 17:26:23 2018] epoch_id: 39, batch_id: 1300, cost: 0.016443, acc: 0.992188
-[Wed Oct 10 17:26:25 2018] epoch_id: 39, batch_id: 1400, cost: 0.036063, acc: 0.992188
-[Wed Oct 10 17:26:28 2018] epoch_id: 39, batch_id: 1500, cost: 0.010849, acc: 0.992188
-[Wed Oct 10 17:26:30 2018] epoch_id: 39, batch_id: 1600, cost: 0.002218, acc: 1.000000
-[Wed Oct 10 17:26:32 2018] epoch_id: 39, batch_id: 1700, cost: 0.011184, acc: 1.000000
-[Wed Oct 10 17:26:34 2018] epoch_id: 39, batch_id: 1800, cost: 0.002410, acc: 1.000000
-[Wed Oct 10 17:26:37 2018] epoch_id: 39, batch_id: 1900, cost: 0.010422, acc: 0.992188
-[Wed Oct 10 17:26:39 2018] epoch_id: 39, batch_id: 2000, cost: 0.012162, acc: 0.992188
-[Wed Oct 10 17:26:41 2018] epoch_id: 39, batch_id: 2100, cost: 0.042420, acc: 0.984375
-[Wed Oct 10 17:26:43 2018] epoch_id: 39, batch_id: 2200, cost: 0.006210, acc: 1.000000
-[Wed Oct 10 17:26:46 2018] epoch_id: 39, batch_id: 2300, cost: 0.002905, acc: 1.000000
-[Wed Oct 10 17:26:48 2018] epoch_id: 39, batch_id: 2400, cost: 0.067472, acc: 0.992188
-[Wed Oct 10 17:26:50 2018] epoch_id: 39, batch_id: 2500, cost: 0.030382, acc: 0.992188
-[Wed Oct 10 17:26:52 2018] epoch_id: 39, batch_id: 2600, cost: 0.049727, acc: 0.992188
-[Wed Oct 10 17:26:54 2018] epoch_id: 39, batch_id: 2700, cost: 0.024157, acc: 0.984375
-[Wed Oct 10 17:26:57 2018] epoch_id: 39, batch_id: 2800, cost: 0.021991, acc: 0.992188
-[Wed Oct 10 17:26:59 2018] epoch_id: 39, batch_id: 2900, cost: 0.001997, acc: 1.000000
-[Wed Oct 10 17:27:01 2018] epoch_id: 39, batch_id: 3000, cost: 0.001907, acc: 1.000000
-
-[Wed Oct 10 17:27:02 2018] epoch_id: 39, train_avg_cost: 0.012756, train_avg_acc: 0.995835
-[Wed Oct 10 17:27:03 2018] epoch_id: 39, dev_cost: 1.650582, accuracy: 0.8342
-[Wed Oct 10 17:27:04 2018] epoch_id: 39, test_cost: 1.662477, accuracy: 0.8325
-
-[Wed Oct 10 17:27:12 2018] epoch_id: 40, batch_id: 0, cost: 0.000858, acc: 1.000000
-[Wed Oct 10 17:27:15 2018] epoch_id: 40, batch_id: 100, cost: 0.000849, acc: 1.000000
-[Wed Oct 10 17:27:17 2018] epoch_id: 40, batch_id: 200, cost: 0.016273, acc: 0.992188
-[Wed Oct 10 17:27:19 2018] epoch_id: 40, batch_id: 300, cost: 0.042659, acc: 0.992188
-[Wed Oct 10 17:27:21 2018] epoch_id: 40, batch_id: 400, cost: 0.010672, acc: 0.992188
-[Wed Oct 10 17:27:24 2018] epoch_id: 40, batch_id: 500, cost: 0.000544, acc: 1.000000
-[Wed Oct 10 17:27:26 2018] epoch_id: 40, batch_id: 600, cost: 0.005578, acc: 1.000000
-[Wed Oct 10 17:27:28 2018] epoch_id: 40, batch_id: 700, cost: 0.039266, acc: 0.992188
-[Wed Oct 10 17:27:31 2018] epoch_id: 40, batch_id: 800, cost: 0.013144, acc: 0.992188
-[Wed Oct 10 17:27:33 2018] epoch_id: 40, batch_id: 900, cost: 0.000740, acc: 1.000000
-[Wed Oct 10 17:27:35 2018] epoch_id: 40, batch_id: 1000, cost: 0.003259, acc: 1.000000
-[Wed Oct 10 17:27:37 2018] epoch_id: 40, batch_id: 1100, cost: 0.002126, acc: 1.000000
-[Wed Oct 10 17:27:40 2018] epoch_id: 40, batch_id: 1200, cost: 0.003089, acc: 1.000000
-[Wed Oct 10 17:27:42 2018] epoch_id: 40, batch_id: 1300, cost: 0.000690, acc: 1.000000
-[Wed Oct 10 17:27:44 2018] epoch_id: 40, batch_id: 1400, cost: 0.000283, acc: 1.000000
-[Wed Oct 10 17:27:46 2018] epoch_id: 40, batch_id: 1500, cost: 0.013878, acc: 0.984375
-[Wed Oct 10 17:27:49 2018] epoch_id: 40, batch_id: 1600, cost: 0.005389, acc: 1.000000
-[Wed Oct 10 17:27:51 2018] epoch_id: 40, batch_id: 1700, cost: 0.024631, acc: 0.992188
-[Wed Oct 10 17:27:53 2018] epoch_id: 40, batch_id: 1800, cost: 0.003978, acc: 1.000000
-[Wed Oct 10 17:27:55 2018] epoch_id: 40, batch_id: 1900, cost: 0.004993, acc: 1.000000
-[Wed Oct 10 17:27:58 2018] epoch_id: 40, batch_id: 2000, cost: 0.014580, acc: 0.984375
-[Wed Oct 10 17:28:00 2018] epoch_id: 40, batch_id: 2100, cost: 0.003148, acc: 1.000000
-[Wed Oct 10 17:28:02 2018] epoch_id: 40, batch_id: 2200, cost: 0.000848, acc: 1.000000
-[Wed Oct 10 17:28:04 2018] epoch_id: 40, batch_id: 2300, cost: 0.009250, acc: 1.000000
-[Wed Oct 10 17:28:06 2018] epoch_id: 40, batch_id: 2400, cost: 0.006138, acc: 1.000000
-[Wed Oct 10 17:28:09 2018] epoch_id: 40, batch_id: 2500, cost: 0.050052, acc: 0.984375
-[Wed Oct 10 17:28:11 2018] epoch_id: 40, batch_id: 2600, cost: 0.005259, acc: 1.000000
-[Wed Oct 10 17:28:13 2018] epoch_id: 40, batch_id: 2700, cost: 0.027375, acc: 0.984375
-[Wed Oct 10 17:28:17 2018] epoch_id: 40, batch_id: 2800, cost: 0.010132, acc: 0.992188
-[Wed Oct 10 17:28:19 2018] epoch_id: 40, batch_id: 2900, cost: 0.003442, acc: 1.000000
-[Wed Oct 10 17:28:21 2018] epoch_id: 40, batch_id: 3000, cost: 0.005328, acc: 1.000000
-
-[Wed Oct 10 17:28:22 2018] epoch_id: 40, train_avg_cost: 0.013034, train_avg_acc: 0.995832
-[Wed Oct 10 17:28:23 2018] epoch_id: 40, dev_cost: 1.424795, accuracy: 0.8311
-[Wed Oct 10 17:28:24 2018] epoch_id: 40, test_cost: 1.404285, accuracy: 0.8345
-
-[Wed Oct 10 17:28:32 2018] epoch_id: 41, batch_id: 0, cost: 0.023169, acc: 0.992188
-[Wed Oct 10 17:28:34 2018] epoch_id: 41, batch_id: 100, cost: 0.008356, acc: 0.992188
-[Wed Oct 10 17:28:36 2018] epoch_id: 41, batch_id: 200, cost: 0.034033, acc: 0.992188
-[Wed Oct 10 17:28:39 2018] epoch_id: 41, batch_id: 300, cost: 0.003154, acc: 1.000000
-[Wed Oct 10 17:28:41 2018] epoch_id: 41, batch_id: 400, cost: 0.000178, acc: 1.000000
-[Wed Oct 10 17:28:43 2018] epoch_id: 41, batch_id: 500, cost: 0.001488, acc: 1.000000
-[Wed Oct 10 17:28:45 2018] epoch_id: 41, batch_id: 600, cost: 0.034724, acc: 0.992188
-[Wed Oct 10 17:28:48 2018] epoch_id: 41, batch_id: 700, cost: 0.011531, acc: 0.992188
-[Wed Oct 10 17:28:50 2018] epoch_id: 41, batch_id: 800, cost: 0.003504, acc: 1.000000
-[Wed Oct 10 17:28:52 2018] epoch_id: 41, batch_id: 900, cost: 0.010360, acc: 0.992188
-[Wed Oct 10 17:28:54 2018] epoch_id: 41, batch_id: 1000, cost: 0.014474, acc: 0.992188
-[Wed Oct 10 17:28:57 2018] epoch_id: 41, batch_id: 1100, cost: 0.005857, acc: 1.000000
-[Wed Oct 10 17:28:59 2018] epoch_id: 41, batch_id: 1200, cost: 0.007621, acc: 0.992188
-[Wed Oct 10 17:29:01 2018] epoch_id: 41, batch_id: 1300, cost: 0.013386, acc: 0.992188
-[Wed Oct 10 17:29:03 2018] epoch_id: 41, batch_id: 1400, cost: 0.004675, acc: 1.000000
-[Wed Oct 10 17:29:05 2018] epoch_id: 41, batch_id: 1500, cost: 0.023563, acc: 0.984375
-[Wed Oct 10 17:29:07 2018] epoch_id: 41, batch_id: 1600, cost: 0.001719, acc: 1.000000
-[Wed Oct 10 17:29:10 2018] epoch_id: 41, batch_id: 1700, cost: 0.000334, acc: 1.000000
-[Wed Oct 10 17:29:12 2018] epoch_id: 41, batch_id: 1800, cost: 0.001468, acc: 1.000000
-[Wed Oct 10 17:29:14 2018] epoch_id: 41, batch_id: 1900, cost: 0.002295, acc: 1.000000
-[Wed Oct 10 17:29:16 2018] epoch_id: 41, batch_id: 2000, cost: 0.021738, acc: 0.984375
-[Wed Oct 10 17:29:19 2018] epoch_id: 41, batch_id: 2100, cost: 0.023329, acc: 0.984375
-[Wed Oct 10 17:29:21 2018] epoch_id: 41, batch_id: 2200, cost: 0.005678, acc: 1.000000
-[Wed Oct 10 17:29:23 2018] epoch_id: 41, batch_id: 2300, cost: 0.004800, acc: 1.000000
-[Wed Oct 10 17:29:27 2018] epoch_id: 41, batch_id: 2400, cost: 0.007035, acc: 1.000000
-[Wed Oct 10 17:29:29 2018] epoch_id: 41, batch_id: 2500, cost: 0.041456, acc: 0.976562
-[Wed Oct 10 17:29:31 2018] epoch_id: 41, batch_id: 2600, cost: 0.011735, acc: 0.992188
-[Wed Oct 10 17:29:33 2018] epoch_id: 41, batch_id: 2700, cost: 0.016611, acc: 0.992188
-[Wed Oct 10 17:29:36 2018] epoch_id: 41, batch_id: 2800, cost: 0.004084, acc: 1.000000
-[Wed Oct 10 17:29:38 2018] epoch_id: 41, batch_id: 2900, cost: 0.001111, acc: 1.000000
-[Wed Oct 10 17:29:40 2018] epoch_id: 41, batch_id: 3000, cost: 0.015571, acc: 0.992188
-
-[Wed Oct 10 17:29:41 2018] epoch_id: 41, train_avg_cost: 0.012473, train_avg_acc: 0.995959
-[Wed Oct 10 17:29:42 2018] epoch_id: 41, dev_cost: 1.301212, accuracy: 0.8313
-[Wed Oct 10 17:29:43 2018] epoch_id: 41, test_cost: 1.292132, accuracy: 0.8314
-
-[Wed Oct 10 17:29:51 2018] epoch_id: 42, batch_id: 0, cost: 0.006710, acc: 1.000000
-[Wed Oct 10 17:29:53 2018] epoch_id: 42, batch_id: 100, cost: 0.003760, acc: 1.000000
-[Wed Oct 10 17:29:56 2018] epoch_id: 42, batch_id: 200, cost: 0.007728, acc: 1.000000
-[Wed Oct 10 17:29:58 2018] epoch_id: 42, batch_id: 300, cost: 0.010997, acc: 1.000000
-[Wed Oct 10 17:30:00 2018] epoch_id: 42, batch_id: 400, cost: 0.015313, acc: 0.984375
-[Wed Oct 10 17:30:02 2018] epoch_id: 42, batch_id: 500, cost: 0.000985, acc: 1.000000
-[Wed Oct 10 17:30:05 2018] epoch_id: 42, batch_id: 600, cost: 0.001277, acc: 1.000000
-[Wed Oct 10 17:30:07 2018] epoch_id: 42, batch_id: 700, cost: 0.002231, acc: 1.000000
-[Wed Oct 10 17:30:10 2018] epoch_id: 42, batch_id: 800, cost: 0.002233, acc: 1.000000
-[Wed Oct 10 17:30:12 2018] epoch_id: 42, batch_id: 900, cost: 0.002083, acc: 1.000000
-[Wed Oct 10 17:30:15 2018] epoch_id: 42, batch_id: 1000, cost: 0.004574, acc: 1.000000
-[Wed Oct 10 17:30:17 2018] epoch_id: 42, batch_id: 1100, cost: 0.004339, acc: 1.000000
-[Wed Oct 10 17:30:19 2018] epoch_id: 42, batch_id: 1200, cost: 0.006596, acc: 1.000000
-[Wed Oct 10 17:30:22 2018] epoch_id: 42, batch_id: 1300, cost: 0.000877, acc: 1.000000
-[Wed Oct 10 17:30:24 2018] epoch_id: 42, batch_id: 1400, cost: 0.001873, acc: 1.000000
-[Wed Oct 10 17:30:26 2018] epoch_id: 42, batch_id: 1500, cost: 0.000632, acc: 1.000000
-[Wed Oct 10 17:30:29 2018] epoch_id: 42, batch_id: 1600, cost: 0.002006, acc: 1.000000
-[Wed Oct 10 17:30:31 2018] epoch_id: 42, batch_id: 1700, cost: 0.002035, acc: 1.000000
-[Wed Oct 10 17:30:33 2018] epoch_id: 42, batch_id: 1800, cost: 0.010094, acc: 1.000000
-[Wed Oct 10 17:30:35 2018] epoch_id: 42, batch_id: 1900, cost: 0.002634, acc: 1.000000
-[Wed Oct 10 17:30:38 2018] epoch_id: 42, batch_id: 2000, cost: 0.045660, acc: 0.984375
-[Wed Oct 10 17:30:40 2018] epoch_id: 42, batch_id: 2100, cost: 0.034275, acc: 0.984375
-[Wed Oct 10 17:30:42 2018] epoch_id: 42, batch_id: 2200, cost: 0.001633, acc: 1.000000
-[Wed Oct 10 17:30:44 2018] epoch_id: 42, batch_id: 2300, cost: 0.001030, acc: 1.000000
-[Wed Oct 10 17:30:47 2018] epoch_id: 42, batch_id: 2400, cost: 0.002235, acc: 1.000000
-[Wed Oct 10 17:30:49 2018] epoch_id: 42, batch_id: 2500, cost: 0.017729, acc: 0.992188
-[Wed Oct 10 17:30:51 2018] epoch_id: 42, batch_id: 2600, cost: 0.004357, acc: 1.000000
-[Wed Oct 10 17:30:53 2018] epoch_id: 42, batch_id: 2700, cost: 0.000981, acc: 1.000000
-[Wed Oct 10 17:30:56 2018] epoch_id: 42, batch_id: 2800, cost: 0.000964, acc: 1.000000
-[Wed Oct 10 17:30:58 2018] epoch_id: 42, batch_id: 2900, cost: 0.018888, acc: 0.992188
-[Wed Oct 10 17:31:00 2018] epoch_id: 42, batch_id: 3000, cost: 0.032965, acc: 0.984375
-
-[Wed Oct 10 17:31:01 2018] epoch_id: 42, train_avg_cost: 0.013007, train_avg_acc: 0.995946
-[Wed Oct 10 17:31:02 2018] epoch_id: 42, dev_cost: 1.701511, accuracy: 0.8335
-[Wed Oct 10 17:31:03 2018] epoch_id: 42, test_cost: 1.704458, accuracy: 0.8312
-
-[Wed Oct 10 17:31:12 2018] epoch_id: 43, batch_id: 0, cost: 0.002044, acc: 1.000000
-[Wed Oct 10 17:31:14 2018] epoch_id: 43, batch_id: 100, cost: 0.018454, acc: 0.992188
-[Wed Oct 10 17:31:16 2018] epoch_id: 43, batch_id: 200, cost: 0.002746, acc: 1.000000
-[Wed Oct 10 17:31:18 2018] epoch_id: 43, batch_id: 300, cost: 0.008316, acc: 0.992188
-[Wed Oct 10 17:31:21 2018] epoch_id: 43, batch_id: 400, cost: 0.009446, acc: 1.000000
-[Wed Oct 10 17:31:23 2018] epoch_id: 43, batch_id: 500, cost: 0.000336, acc: 1.000000
-[Wed Oct 10 17:31:25 2018] epoch_id: 43, batch_id: 600, cost: 0.000436, acc: 1.000000
-[Wed Oct 10 17:31:27 2018] epoch_id: 43, batch_id: 700, cost: 0.000142, acc: 1.000000
-[Wed Oct 10 17:31:30 2018] epoch_id: 43, batch_id: 800, cost: 0.001449, acc: 1.000000
-[Wed Oct 10 17:31:32 2018] epoch_id: 43, batch_id: 900, cost: 0.040274, acc: 0.992188
-[Wed Oct 10 17:31:34 2018] epoch_id: 43, batch_id: 1000, cost: 0.002314, acc: 1.000000
-[Wed Oct 10 17:31:36 2018] epoch_id: 43, batch_id: 1100, cost: 0.008140, acc: 0.992188
-[Wed Oct 10 17:31:39 2018] epoch_id: 43, batch_id: 1200, cost: 0.001320, acc: 1.000000
-[Wed Oct 10 17:31:41 2018] epoch_id: 43, batch_id: 1300, cost: 0.000427, acc: 1.000000
-[Wed Oct 10 17:31:43 2018] epoch_id: 43, batch_id: 1400, cost: 0.004985, acc: 1.000000
-[Wed Oct 10 17:31:46 2018] epoch_id: 43, batch_id: 1500, cost: 0.005165, acc: 1.000000
-[Wed Oct 10 17:31:48 2018] epoch_id: 43, batch_id: 1600, cost: 0.006397, acc: 1.000000
-[Wed Oct 10 17:31:50 2018] epoch_id: 43, batch_id: 1700, cost: 0.026334, acc: 0.984375
-[Wed Oct 10 17:31:54 2018] epoch_id: 43, batch_id: 1800, cost: 0.003058, acc: 1.000000
-[Wed Oct 10 17:31:56 2018] epoch_id: 43, batch_id: 1900, cost: 0.009215, acc: 1.000000
-[Wed Oct 10 17:31:58 2018] epoch_id: 43, batch_id: 2000, cost: 0.005750, acc: 1.000000
-[Wed Oct 10 17:32:01 2018] epoch_id: 43, batch_id: 2100, cost: 0.006973, acc: 1.000000
-[Wed Oct 10 17:32:03 2018] epoch_id: 43, batch_id: 2200, cost: 0.040183, acc: 0.984375
-[Wed Oct 10 17:32:05 2018] epoch_id: 43, batch_id: 2300, cost: 0.007980, acc: 0.992188
-[Wed Oct 10 17:32:07 2018] epoch_id: 43, batch_id: 2400, cost: 0.018794, acc: 0.992188
-[Wed Oct 10 17:32:10 2018] epoch_id: 43, batch_id: 2500, cost: 0.031288, acc: 0.984375
-[Wed Oct 10 17:32:12 2018] epoch_id: 43, batch_id: 2600, cost: 0.010219, acc: 0.992188
-[Wed Oct 10 17:32:14 2018] epoch_id: 43, batch_id: 2700, cost: 0.021514, acc: 0.984375
-[Wed Oct 10 17:32:17 2018] epoch_id: 43, batch_id: 2800, cost: 0.005614, acc: 1.000000
-[Wed Oct 10 17:32:19 2018] epoch_id: 43, batch_id: 2900, cost: 0.065875, acc: 0.984375
-[Wed Oct 10 17:32:21 2018] epoch_id: 43, batch_id: 3000, cost: 0.013279, acc: 0.992188
-
-[Wed Oct 10 17:32:22 2018] epoch_id: 43, train_avg_cost: 0.011822, train_avg_acc: 0.996238
-[Wed Oct 10 17:32:23 2018] epoch_id: 43, dev_cost: 1.703876, accuracy: 0.8322
-[Wed Oct 10 17:32:24 2018] epoch_id: 43, test_cost: 1.724094, accuracy: 0.8315
-
-[Wed Oct 10 17:32:32 2018] epoch_id: 44, batch_id: 0, cost: 0.003358, acc: 1.000000
-[Wed Oct 10 17:32:34 2018] epoch_id: 44, batch_id: 100, cost: 0.003024, acc: 1.000000
-[Wed Oct 10 17:32:37 2018] epoch_id: 44, batch_id: 200, cost: 0.038726, acc: 0.992188
-[Wed Oct 10 17:32:39 2018] epoch_id: 44, batch_id: 300, cost: 0.001766, acc: 1.000000
-[Wed Oct 10 17:32:41 2018] epoch_id: 44, batch_id: 400, cost: 0.005300, acc: 1.000000
-[Wed Oct 10 17:32:43 2018] epoch_id: 44, batch_id: 500, cost: 0.023175, acc: 0.992188
-[Wed Oct 10 17:32:46 2018] epoch_id: 44, batch_id: 600, cost: 0.002893, acc: 1.000000
-[Wed Oct 10 17:32:48 2018] epoch_id: 44, batch_id: 700, cost: 0.025870, acc: 0.976562
-[Wed Oct 10 17:32:50 2018] epoch_id: 44, batch_id: 800, cost: 0.019898, acc: 0.992188
-[Wed Oct 10 17:32:52 2018] epoch_id: 44, batch_id: 900, cost: 0.001718, acc: 1.000000
-[Wed Oct 10 17:32:55 2018] epoch_id: 44, batch_id: 1000, cost: 0.000221, acc: 1.000000
-[Wed Oct 10 17:32:57 2018] epoch_id: 44, batch_id: 1100, cost: 0.002172, acc: 1.000000
-[Wed Oct 10 17:32:59 2018] epoch_id: 44, batch_id: 1200, cost: 0.001158, acc: 1.000000
-[Wed Oct 10 17:33:02 2018] epoch_id: 44, batch_id: 1300, cost: 0.004667, acc: 1.000000
-[Wed Oct 10 17:33:04 2018] epoch_id: 44, batch_id: 1400, cost: 0.000685, acc: 1.000000
-[Wed Oct 10 17:33:06 2018] epoch_id: 44, batch_id: 1500, cost: 0.007730, acc: 1.000000
-[Wed Oct 10 17:33:08 2018] epoch_id: 44, batch_id: 1600, cost: 0.006694, acc: 1.000000
-[Wed Oct 10 17:33:11 2018] epoch_id: 44, batch_id: 1700, cost: 0.009508, acc: 0.992188
-[Wed Oct 10 17:33:13 2018] epoch_id: 44, batch_id: 1800, cost: 0.018037, acc: 0.992188
-[Wed Oct 10 17:33:15 2018] epoch_id: 44, batch_id: 1900, cost: 0.020902, acc: 0.976562
-[Wed Oct 10 17:33:18 2018] epoch_id: 44, batch_id: 2000, cost: 0.006977, acc: 0.992188
-[Wed Oct 10 17:33:20 2018] epoch_id: 44, batch_id: 2100, cost: 0.004821, acc: 1.000000
-[Wed Oct 10 17:33:22 2018] epoch_id: 44, batch_id: 2200, cost: 0.000209, acc: 1.000000
-[Wed Oct 10 17:33:25 2018] epoch_id: 44, batch_id: 2300, cost: 0.008764, acc: 0.992188
-[Wed Oct 10 17:33:27 2018] epoch_id: 44, batch_id: 2400, cost: 0.029171, acc: 0.992188
-[Wed Oct 10 17:33:29 2018] epoch_id: 44, batch_id: 2500, cost: 0.015028, acc: 0.992188
-[Wed Oct 10 17:33:31 2018] epoch_id: 44, batch_id: 2600, cost: 0.007096, acc: 1.000000
-[Wed Oct 10 17:33:33 2018] epoch_id: 44, batch_id: 2700, cost: 0.000547, acc: 1.000000
-[Wed Oct 10 17:33:36 2018] epoch_id: 44, batch_id: 2800, cost: 0.004024, acc: 1.000000
-[Wed Oct 10 17:33:38 2018] epoch_id: 44, batch_id: 2900, cost: 0.002191, acc: 1.000000
-[Wed Oct 10 17:33:40 2018] epoch_id: 44, batch_id: 3000, cost: 0.008875, acc: 1.000000
-
-[Wed Oct 10 17:33:41 2018] epoch_id: 44, train_avg_cost: 0.012328, train_avg_acc: 0.996076
-[Wed Oct 10 17:33:42 2018] epoch_id: 44, dev_cost: 1.575702, accuracy: 0.8331
-[Wed Oct 10 17:33:43 2018] epoch_id: 44, test_cost: 1.573283, accuracy: 0.8313
-
-[Wed Oct 10 17:33:52 2018] epoch_id: 45, batch_id: 0, cost: 0.002271, acc: 1.000000
-[Wed Oct 10 17:33:54 2018] epoch_id: 45, batch_id: 100, cost: 0.005500, acc: 1.000000
-[Wed Oct 10 17:33:56 2018] epoch_id: 45, batch_id: 200, cost: 0.001735, acc: 1.000000
-[Wed Oct 10 17:33:58 2018] epoch_id: 45, batch_id: 300, cost: 0.008910, acc: 1.000000
-[Wed Oct 10 17:34:01 2018] epoch_id: 45, batch_id: 400, cost: 0.010551, acc: 0.992188
-[Wed Oct 10 17:34:03 2018] epoch_id: 45, batch_id: 500, cost: 0.005958, acc: 1.000000
-[Wed Oct 10 17:34:05 2018] epoch_id: 45, batch_id: 600, cost: 0.012035, acc: 0.992188
-[Wed Oct 10 17:34:07 2018] epoch_id: 45, batch_id: 700, cost: 0.002110, acc: 1.000000
-[Wed Oct 10 17:34:10 2018] epoch_id: 45, batch_id: 800, cost: 0.014834, acc: 0.992188
-[Wed Oct 10 17:34:12 2018] epoch_id: 45, batch_id: 900, cost: 0.010944, acc: 0.992188
-[Wed Oct 10 17:34:14 2018] epoch_id: 45, batch_id: 1000, cost: 0.017574, acc: 0.992188
-[Wed Oct 10 17:34:16 2018] epoch_id: 45, batch_id: 1100, cost: 0.006877, acc: 1.000000
-[Wed Oct 10 17:34:19 2018] epoch_id: 45, batch_id: 1200, cost: 0.001731, acc: 1.000000
-[Wed Oct 10 17:34:21 2018] epoch_id: 45, batch_id: 1300, cost: 0.002963, acc: 1.000000
-[Wed Oct 10 17:34:23 2018] epoch_id: 45, batch_id: 1400, cost: 0.009798, acc: 1.000000
-[Wed Oct 10 17:34:25 2018] epoch_id: 45, batch_id: 1500, cost: 0.003309, acc: 1.000000
-[Wed Oct 10 17:34:28 2018] epoch_id: 45, batch_id: 1600, cost: 0.022402, acc: 0.984375
-[Wed Oct 10 17:34:30 2018] epoch_id: 45, batch_id: 1700, cost: 0.003854, acc: 1.000000
-[Wed Oct 10 17:34:32 2018] epoch_id: 45, batch_id: 1800, cost: 0.000418, acc: 1.000000
-[Wed Oct 10 17:34:35 2018] epoch_id: 45, batch_id: 1900, cost: 0.014512, acc: 0.992188
-[Wed Oct 10 17:34:37 2018] epoch_id: 45, batch_id: 2000, cost: 0.031922, acc: 0.992188
-[Wed Oct 10 17:34:39 2018] epoch_id: 45, batch_id: 2100, cost: 0.002671, acc: 1.000000
-[Wed Oct 10 17:34:42 2018] epoch_id: 45, batch_id: 2200, cost: 0.042934, acc: 0.984375
-[Wed Oct 10 17:34:44 2018] epoch_id: 45, batch_id: 2300, cost: 0.008559, acc: 1.000000
-[Wed Oct 10 17:34:46 2018] epoch_id: 45, batch_id: 2400, cost: 0.050518, acc: 0.984375
-[Wed Oct 10 17:34:48 2018] epoch_id: 45, batch_id: 2500, cost: 0.001887, acc: 1.000000
-[Wed Oct 10 17:34:50 2018] epoch_id: 45, batch_id: 2600, cost: 0.002196, acc: 1.000000
-[Wed Oct 10 17:34:54 2018] epoch_id: 45, batch_id: 2700, cost: 0.002765, acc: 1.000000
-[Wed Oct 10 17:34:56 2018] epoch_id: 45, batch_id: 2800, cost: 0.024691, acc: 0.992188
-[Wed Oct 10 17:34:59 2018] epoch_id: 45, batch_id: 2900, cost: 0.003790, acc: 1.000000
-[Wed Oct 10 17:35:01 2018] epoch_id: 45, batch_id: 3000, cost: 0.001317, acc: 1.000000
-
-[Wed Oct 10 17:35:01 2018] epoch_id: 45, train_avg_cost: 0.012084, train_avg_acc: 0.996298
-[Wed Oct 10 17:35:02 2018] epoch_id: 45, dev_cost: 1.603634, accuracy: 0.8321
-[Wed Oct 10 17:35:03 2018] epoch_id: 45, test_cost: 1.609678, accuracy: 0.8291
-
-[Wed Oct 10 17:35:12 2018] epoch_id: 46, batch_id: 0, cost: 0.002291, acc: 1.000000
-[Wed Oct 10 17:35:14 2018] epoch_id: 46, batch_id: 100, cost: 0.018703, acc: 0.992188
-[Wed Oct 10 17:35:16 2018] epoch_id: 46, batch_id: 200, cost: 0.004407, acc: 1.000000
-[Wed Oct 10 17:35:18 2018] epoch_id: 46, batch_id: 300, cost: 0.000953, acc: 1.000000
-[Wed Oct 10 17:35:21 2018] epoch_id: 46, batch_id: 400, cost: 0.000732, acc: 1.000000
-[Wed Oct 10 17:35:23 2018] epoch_id: 46, batch_id: 500, cost: 0.011275, acc: 0.992188
-[Wed Oct 10 17:35:25 2018] epoch_id: 46, batch_id: 600, cost: 0.009521, acc: 1.000000
-[Wed Oct 10 17:35:27 2018] epoch_id: 46, batch_id: 700, cost: 0.000671, acc: 1.000000
-[Wed Oct 10 17:35:30 2018] epoch_id: 46, batch_id: 800, cost: 0.000768, acc: 1.000000
-[Wed Oct 10 17:35:32 2018] epoch_id: 46, batch_id: 900, cost: 0.001357, acc: 1.000000
-[Wed Oct 10 17:35:34 2018] epoch_id: 46, batch_id: 1000, cost: 0.001384, acc: 1.000000
-[Wed Oct 10 17:35:37 2018] epoch_id: 46, batch_id: 1100, cost: 0.010220, acc: 0.992188
-[Wed Oct 10 17:35:39 2018] epoch_id: 46, batch_id: 1200, cost: 0.006540, acc: 1.000000
-[Wed Oct 10 17:35:41 2018] epoch_id: 46, batch_id: 1300, cost: 0.002771, acc: 1.000000
-[Wed Oct 10 17:35:44 2018] epoch_id: 46, batch_id: 1400, cost: 0.010623, acc: 0.992188
-[Wed Oct 10 17:35:46 2018] epoch_id: 46, batch_id: 1500, cost: 0.000798, acc: 1.000000
-[Wed Oct 10 17:35:48 2018] epoch_id: 46, batch_id: 1600, cost: 0.004519, acc: 1.000000
-[Wed Oct 10 17:35:50 2018] epoch_id: 46, batch_id: 1700, cost: 0.010096, acc: 1.000000
-[Wed Oct 10 17:35:53 2018] epoch_id: 46, batch_id: 1800, cost: 0.001868, acc: 1.000000
-[Wed Oct 10 17:35:55 2018] epoch_id: 46, batch_id: 1900, cost: 0.039460, acc: 0.984375
-[Wed Oct 10 17:35:57 2018] epoch_id: 46, batch_id: 2000, cost: 0.008906, acc: 1.000000
-[Wed Oct 10 17:35:59 2018] epoch_id: 46, batch_id: 2100, cost: 0.008440, acc: 0.992188
-[Wed Oct 10 17:36:02 2018] epoch_id: 46, batch_id: 2200, cost: 0.014774, acc: 0.992188
-[Wed Oct 10 17:36:05 2018] epoch_id: 46, batch_id: 2300, cost: 0.016775, acc: 0.992188
-[Wed Oct 10 17:36:07 2018] epoch_id: 46, batch_id: 2400, cost: 0.008999, acc: 0.992188
-[Wed Oct 10 17:36:10 2018] epoch_id: 46, batch_id: 2500, cost: 0.001394, acc: 1.000000
-[Wed Oct 10 17:36:12 2018] epoch_id: 46, batch_id: 2600, cost: 0.005627, acc: 1.000000
-[Wed Oct 10 17:36:14 2018] epoch_id: 46, batch_id: 2700, cost: 0.003667, acc: 1.000000
-[Wed Oct 10 17:36:16 2018] epoch_id: 46, batch_id: 2800, cost: 0.016338, acc: 0.992188
-[Wed Oct 10 17:36:19 2018] epoch_id: 46, batch_id: 2900, cost: 0.005622, acc: 1.000000
-[Wed Oct 10 17:36:21 2018] epoch_id: 46, batch_id: 3000, cost: 0.003068, acc: 1.000000
-
-[Wed Oct 10 17:36:21 2018] epoch_id: 46, train_avg_cost: 0.011216, train_avg_acc: 0.996266
-[Wed Oct 10 17:36:22 2018] epoch_id: 46, dev_cost: 1.772260, accuracy: 0.8309
-[Wed Oct 10 17:36:23 2018] epoch_id: 46, test_cost: 1.783967, accuracy: 0.8284
-
-[Wed Oct 10 17:36:32 2018] epoch_id: 47, batch_id: 0, cost: 0.001193, acc: 1.000000
-[Wed Oct 10 17:36:34 2018] epoch_id: 47, batch_id: 100, cost: 0.000584, acc: 1.000000
-[Wed Oct 10 17:36:36 2018] epoch_id: 47, batch_id: 200, cost: 0.001534, acc: 1.000000
-[Wed Oct 10 17:36:38 2018] epoch_id: 47, batch_id: 300, cost: 0.014105, acc: 0.992188
-[Wed Oct 10 17:36:41 2018] epoch_id: 47, batch_id: 400, cost: 0.000929, acc: 1.000000
-[Wed Oct 10 17:36:43 2018] epoch_id: 47, batch_id: 500, cost: 0.007649, acc: 0.992188
-[Wed Oct 10 17:36:45 2018] epoch_id: 47, batch_id: 600, cost: 0.009973, acc: 0.992188
-[Wed Oct 10 17:36:47 2018] epoch_id: 47, batch_id: 700, cost: 0.006471, acc: 1.000000
-[Wed Oct 10 17:36:49 2018] epoch_id: 47, batch_id: 800, cost: 0.002720, acc: 1.000000
-[Wed Oct 10 17:36:53 2018] epoch_id: 47, batch_id: 900, cost: 0.001402, acc: 1.000000
-[Wed Oct 10 17:36:55 2018] epoch_id: 47, batch_id: 1000, cost: 0.000697, acc: 1.000000
-[Wed Oct 10 17:36:57 2018] epoch_id: 47, batch_id: 1100, cost: 0.001998, acc: 1.000000
-[Wed Oct 10 17:36:59 2018] epoch_id: 47, batch_id: 1200, cost: 0.009035, acc: 0.992188
-[Wed Oct 10 17:37:02 2018] epoch_id: 47, batch_id: 1300, cost: 0.006139, acc: 1.000000
-[Wed Oct 10 17:37:04 2018] epoch_id: 47, batch_id: 1400, cost: 0.007283, acc: 1.000000
-[Wed Oct 10 17:37:06 2018] epoch_id: 47, batch_id: 1500, cost: 0.016960, acc: 0.992188
-[Wed Oct 10 17:37:08 2018] epoch_id: 47, batch_id: 1600, cost: 0.001158, acc: 1.000000
-[Wed Oct 10 17:37:11 2018] epoch_id: 47, batch_id: 1700, cost: 0.001425, acc: 1.000000
-[Wed Oct 10 17:37:13 2018] epoch_id: 47, batch_id: 1800, cost: 0.001285, acc: 1.000000
-[Wed Oct 10 17:37:15 2018] epoch_id: 47, batch_id: 1900, cost: 0.002734, acc: 1.000000
-[Wed Oct 10 17:37:17 2018] epoch_id: 47, batch_id: 2000, cost: 0.000576, acc: 1.000000
-[Wed Oct 10 17:37:20 2018] epoch_id: 47, batch_id: 2100, cost: 0.001285, acc: 1.000000
-[Wed Oct 10 17:37:22 2018] epoch_id: 47, batch_id: 2200, cost: 0.000798, acc: 1.000000
-[Wed Oct 10 17:37:24 2018] epoch_id: 47, batch_id: 2300, cost: 0.059468, acc: 0.984375
-[Wed Oct 10 17:37:26 2018] epoch_id: 47, batch_id: 2400, cost: 0.004177, acc: 1.000000
-[Wed Oct 10 17:37:29 2018] epoch_id: 47, batch_id: 2500, cost: 0.001915, acc: 1.000000
-[Wed Oct 10 17:37:31 2018] epoch_id: 47, batch_id: 2600, cost: 0.000491, acc: 1.000000
-[Wed Oct 10 17:37:33 2018] epoch_id: 47, batch_id: 2700, cost: 0.001129, acc: 1.000000
-[Wed Oct 10 17:37:35 2018] epoch_id: 47, batch_id: 2800, cost: 0.000988, acc: 1.000000
-[Wed Oct 10 17:37:38 2018] epoch_id: 47, batch_id: 2900, cost: 0.024258, acc: 0.992188
-[Wed Oct 10 17:37:40 2018] epoch_id: 47, batch_id: 3000, cost: 0.000902, acc: 1.000000
-
-[Wed Oct 10 17:37:41 2018] epoch_id: 47, train_avg_cost: 0.011536, train_avg_acc: 0.996225
-[Wed Oct 10 17:37:42 2018] epoch_id: 47, dev_cost: 2.235156, accuracy: 0.8326
-[Wed Oct 10 17:37:43 2018] epoch_id: 47, test_cost: 2.289617, accuracy: 0.8318
-
-[Wed Oct 10 17:37:51 2018] epoch_id: 48, batch_id: 0, cost: 0.007975, acc: 1.000000
-[Wed Oct 10 17:37:54 2018] epoch_id: 48, batch_id: 100, cost: 0.000491, acc: 1.000000
-[Wed Oct 10 17:37:56 2018] epoch_id: 48, batch_id: 200, cost: 0.015161, acc: 0.992188
-[Wed Oct 10 17:37:58 2018] epoch_id: 48, batch_id: 300, cost: 0.030692, acc: 0.992188
-[Wed Oct 10 17:38:00 2018] epoch_id: 48, batch_id: 400, cost: 0.016749, acc: 0.992188
-[Wed Oct 10 17:38:03 2018] epoch_id: 48, batch_id: 500, cost: 0.005637, acc: 1.000000
-[Wed Oct 10 17:38:05 2018] epoch_id: 48, batch_id: 600, cost: 0.014267, acc: 0.992188
-[Wed Oct 10 17:38:07 2018] epoch_id: 48, batch_id: 700, cost: 0.002352, acc: 1.000000
-[Wed Oct 10 17:38:10 2018] epoch_id: 48, batch_id: 800, cost: 0.002758, acc: 1.000000
-[Wed Oct 10 17:38:12 2018] epoch_id: 48, batch_id: 900, cost: 0.000367, acc: 1.000000
-[Wed Oct 10 17:38:14 2018] epoch_id: 48, batch_id: 1000, cost: 0.003479, acc: 1.000000
-[Wed Oct 10 17:38:16 2018] epoch_id: 48, batch_id: 1100, cost: 0.006107, acc: 1.000000
-[Wed Oct 10 17:38:19 2018] epoch_id: 48, batch_id: 1200, cost: 0.000989, acc: 1.000000
-[Wed Oct 10 17:38:21 2018] epoch_id: 48, batch_id: 1300, cost: 0.000442, acc: 1.000000
-[Wed Oct 10 17:38:23 2018] epoch_id: 48, batch_id: 1400, cost: 0.002006, acc: 1.000000
-[Wed Oct 10 17:38:25 2018] epoch_id: 48, batch_id: 1500, cost: 0.022174, acc: 0.992188
-[Wed Oct 10 17:38:28 2018] epoch_id: 48, batch_id: 1600, cost: 0.004670, acc: 1.000000
-[Wed Oct 10 17:38:30 2018] epoch_id: 48, batch_id: 1700, cost: 0.014862, acc: 0.992188
-[Wed Oct 10 17:38:32 2018] epoch_id: 48, batch_id: 1800, cost: 0.004648, acc: 1.000000
-[Wed Oct 10 17:38:36 2018] epoch_id: 48, batch_id: 1900, cost: 0.035342, acc: 0.992188
-[Wed Oct 10 17:38:38 2018] epoch_id: 48, batch_id: 2000, cost: 0.018578, acc: 0.992188
-[Wed Oct 10 17:38:40 2018] epoch_id: 48, batch_id: 2100, cost: 0.003790, acc: 1.000000
-[Wed Oct 10 17:38:42 2018] epoch_id: 48, batch_id: 2200, cost: 0.026731, acc: 0.984375
-[Wed Oct 10 17:38:45 2018] epoch_id: 48, batch_id: 2300, cost: 0.003608, acc: 1.000000
-[Wed Oct 10 17:38:47 2018] epoch_id: 48, batch_id: 2400, cost: 0.005601, acc: 1.000000
-[Wed Oct 10 17:38:49 2018] epoch_id: 48, batch_id: 2500, cost: 0.000833, acc: 1.000000
-[Wed Oct 10 17:38:52 2018] epoch_id: 48, batch_id: 2600, cost: 0.004157, acc: 1.000000
-[Wed Oct 10 17:38:54 2018] epoch_id: 48, batch_id: 2700, cost: 0.010146, acc: 0.992188
-[Wed Oct 10 17:38:56 2018] epoch_id: 48, batch_id: 2800, cost: 0.001127, acc: 1.000000
-[Wed Oct 10 17:38:58 2018] epoch_id: 48, batch_id: 2900, cost: 0.004332, acc: 1.000000
-[Wed Oct 10 17:39:01 2018] epoch_id: 48, batch_id: 3000, cost: 0.004895, acc: 1.000000
-
-[Wed Oct 10 17:39:01 2018] epoch_id: 48, train_avg_cost: 0.010959, train_avg_acc: 0.996475
-[Wed Oct 10 17:39:02 2018] epoch_id: 48, dev_cost: 1.764490, accuracy: 0.8343
-[Wed Oct 10 17:39:03 2018] epoch_id: 48, test_cost: 1.826369, accuracy: 0.8296
-
-[Wed Oct 10 17:39:12 2018] epoch_id: 49, batch_id: 0, cost: 0.004527, acc: 1.000000
-[Wed Oct 10 17:39:14 2018] epoch_id: 49, batch_id: 100, cost: 0.003537, acc: 1.000000
-[Wed Oct 10 17:39:16 2018] epoch_id: 49, batch_id: 200, cost: 0.034318, acc: 0.992188
-[Wed Oct 10 17:39:19 2018] epoch_id: 49, batch_id: 300, cost: 0.024897, acc: 0.992188
-[Wed Oct 10 17:39:21 2018] epoch_id: 49, batch_id: 400, cost: 0.002212, acc: 1.000000
-[Wed Oct 10 17:39:23 2018] epoch_id: 49, batch_id: 500, cost: 0.012678, acc: 0.992188
-[Wed Oct 10 17:39:25 2018] epoch_id: 49, batch_id: 600, cost: 0.006081, acc: 1.000000
-[Wed Oct 10 17:39:28 2018] epoch_id: 49, batch_id: 700, cost: 0.004294, acc: 1.000000
-[Wed Oct 10 17:39:30 2018] epoch_id: 49, batch_id: 800, cost: 0.000339, acc: 1.000000
-[Wed Oct 10 17:39:32 2018] epoch_id: 49, batch_id: 900, cost: 0.006350, acc: 0.992188
-[Wed Oct 10 17:39:35 2018] epoch_id: 49, batch_id: 1000, cost: 0.002183, acc: 1.000000
-[Wed Oct 10 17:39:37 2018] epoch_id: 49, batch_id: 1100, cost: 0.006977, acc: 1.000000
-[Wed Oct 10 17:39:39 2018] epoch_id: 49, batch_id: 1200, cost: 0.003140, acc: 1.000000
-[Wed Oct 10 17:39:41 2018] epoch_id: 49, batch_id: 1300, cost: 0.003361, acc: 1.000000
-[Wed Oct 10 17:39:44 2018] epoch_id: 49, batch_id: 1400, cost: 0.002039, acc: 1.000000
-[Wed Oct 10 17:39:46 2018] epoch_id: 49, batch_id: 1500, cost: 0.001850, acc: 1.000000
-[Wed Oct 10 17:39:48 2018] epoch_id: 49, batch_id: 1600, cost: 0.045419, acc: 0.992188
-[Wed Oct 10 17:39:50 2018] epoch_id: 49, batch_id: 1700, cost: 0.000883, acc: 1.000000
-[Wed Oct 10 17:39:53 2018] epoch_id: 49, batch_id: 1800, cost: 0.002086, acc: 1.000000
-[Wed Oct 10 17:39:55 2018] epoch_id: 49, batch_id: 1900, cost: 0.014964, acc: 0.992188
-[Wed Oct 10 17:39:57 2018] epoch_id: 49, batch_id: 2000, cost: 0.002001, acc: 1.000000
-[Wed Oct 10 17:39:59 2018] epoch_id: 49, batch_id: 2100, cost: 0.013663, acc: 0.984375
-[Wed Oct 10 17:40:02 2018] epoch_id: 49, batch_id: 2200, cost: 0.013116, acc: 0.992188
-[Wed Oct 10 17:40:04 2018] epoch_id: 49, batch_id: 2300, cost: 0.002713, acc: 1.000000
-[Wed Oct 10 17:40:06 2018] epoch_id: 49, batch_id: 2400, cost: 0.004193, acc: 1.000000
-[Wed Oct 10 17:40:08 2018] epoch_id: 49, batch_id: 2500, cost: 0.001507, acc: 1.000000
-[Wed Oct 10 17:40:11 2018] epoch_id: 49, batch_id: 2600, cost: 0.034837, acc: 0.992188
-[Wed Oct 10 17:40:13 2018] epoch_id: 49, batch_id: 2700, cost: 0.006245, acc: 1.000000
-[Wed Oct 10 17:40:15 2018] epoch_id: 49, batch_id: 2800, cost: 0.003659, acc: 1.000000
-[Wed Oct 10 17:40:17 2018] epoch_id: 49, batch_id: 2900, cost: 0.002175, acc: 1.000000
-[Wed Oct 10 17:40:19 2018] epoch_id: 49, batch_id: 3000, cost: 0.000767, acc: 1.000000
-
-[Wed Oct 10 17:40:20 2018] epoch_id: 49, train_avg_cost: 0.011233, train_avg_acc: 0.996326
-[Wed Oct 10 17:40:21 2018] epoch_id: 49, dev_cost: 1.652680, accuracy: 0.8353
-[Wed Oct 10 17:40:22 2018] epoch_id: 49, test_cost: 1.685406, accuracy: 0.8324
-
diff --git a/PaddleRec/text_matching_on_quora/configs/__init__.py b/PaddleRec/text_matching_on_quora/configs/__init__.py
deleted file mode 100755
index 5711d76f3c8f838ab9e2bf2cc4fa5e3ccb288563..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/configs/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .cdssm import cdssm_base
-from .dec_att import decatt_glove
-from .sse import sse_base
-from .infer_sent import infer_sent_v1
-from .infer_sent import infer_sent_v2
diff --git a/PaddleRec/text_matching_on_quora/configs/basic_config.py b/PaddleRec/text_matching_on_quora/configs/basic_config.py
deleted file mode 100755
index 70c2ee06897116d71742a7eb86f3ea912a67dce4..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/configs/basic_config.py
+++ /dev/null
@@ -1,56 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from __future__ import print_function
-
-
-class config(object):
-    def __init__(self):
-        self.batch_size = 128
-        self.epoch_num = 50
-
-        self.optimizer_type = 'adam'  # sgd, adagrad
-
-        # pretrained word embedding 
-        self.use_pretrained_word_embedding = True
-        # when employing pretrained word embedding,  
-        # out of vocabulary words' embedding is initialized with uniform or normal numbers
-        self.OOV_fill = 'uniform'
-        self.embedding_norm = False
-
-        # or else, use padding and masks for sequence data
-        self.use_lod_tensor = True
-
-        # lr = lr * lr_decay after each epoch
-        self.lr_decay = 1
-        self.learning_rate = 0.001
-
-        self.save_dirname = 'model_dir'
-
-        self.train_samples_num = 384348
-        self.duplicate_data = False
-
-        self.metric_type = ['accuracy']
-
-    def list_config(self):
-        print("config", self.__dict__)
-
-    def has_member(self, var_name):
-        return var_name in self.__dict__
-
-
-if __name__ == "__main__":
-    basic = config()
-    basic.list_config()
-    basic.ahh = 2
-    basic.list_config()
diff --git a/PaddleRec/text_matching_on_quora/configs/cdssm.py b/PaddleRec/text_matching_on_quora/configs/cdssm.py
deleted file mode 100755
index b773d4cacb2bb21468a6208fd6317ad1e9779c92..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/configs/cdssm.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import basic_config
-
-
-def cdssm_base():
-    """
-    set configs
-    """
-    config = basic_config.config()
-    config.learning_rate = 0.001
-    config.save_dirname = "model_dir"
-    config.use_pretrained_word_embedding = True
-    config.dict_dim = 40000  # approx_vocab_size
-
-    # net config
-    config.emb_dim = 300
-    config.kernel_size = 5
-    config.kernel_count = 300
-    config.fc_dim = 128
-    config.mlp_hid_dim = [128, 128]
-    config.droprate_conv = 0.1
-    config.droprate_fc = 0.1
-    config.class_dim = 2
-
-    return config
diff --git a/PaddleRec/text_matching_on_quora/configs/dec_att.py b/PaddleRec/text_matching_on_quora/configs/dec_att.py
deleted file mode 100755
index dbb9977e445ff3304a2dd19665d8e7760726ed4e..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/configs/dec_att.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import basic_config
-
-
-def decatt_glove():
-    """
-    use config 'decAtt_glove' in the paper 'Neural Paraphrase Identification of Questions with Noisy Pretraining'
-    """
-    config = basic_config.config()
-    config.learning_rate = 0.05
-    config.save_dirname = "model_dir"
-    config.use_pretrained_word_embedding = True
-    config.dict_dim = 40000  # approx_vocab_size
-    config.metric_type = ['accuracy', 'accuracy_with_threshold']
-    config.optimizer_type = 'sgd'
-    config.lr_decay = 1
-    config.use_lod_tensor = False
-    config.embedding_norm = False
-    config.OOV_fill = 'uniform'
-    config.duplicate_data = False
-
-    # net config
-    config.emb_dim = 300
-    config.proj_emb_dim = 200  #TODO: has project?
-    config.num_units = [400, 200]
-    config.word_embedding_trainable = True
-    config.droprate = 0.1
-    config.share_wight_btw_seq = True
-    config.class_dim = 2
-
-    return config
-
-
-def decatt_word():
-    """
-    use config 'decAtt_glove' in the paper 'Neural Paraphrase Identification of Questions with Noisy Pretraining'
-    """
-    config = basic_config.config()
-    config.learning_rate = 0.05
-    config.save_dirname = "model_dir"
-    config.use_pretrained_word_embedding = False
-    config.dict_dim = 40000  # approx_vocab_size
-    config.metric_type = ['accuracy', 'accuracy_with_threshold']
-    config.optimizer_type = 'sgd'
-    config.lr_decay = 1
-    config.use_lod_tensor = False
-    config.embedding_norm = False
-    config.OOV_fill = 'uniform'
-    config.duplicate_data = False
-
-    # net config
-    config.emb_dim = 300
-    config.proj_emb_dim = 200  #TODO: has project?
-    config.num_units = [400, 200]
-    config.word_embedding_trainable = True
-    config.droprate = 0.1
-    config.share_wight_btw_seq = True
-    config.class_dim = 2
-
-    return config
diff --git a/PaddleRec/text_matching_on_quora/configs/infer_sent.py b/PaddleRec/text_matching_on_quora/configs/infer_sent.py
deleted file mode 100755
index 896672c49806ace936c20fe27a865ea6d59254bc..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/configs/infer_sent.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import basic_config
-
-
-def infer_sent_v1():
-    """
-    set configs
-    """
-    config = basic_config.config()
-    config.learning_rate = 0.1
-    config.lr_decay = 0.99
-    config.optimizer_type = 'sgd'
-    config.save_dirname = "model_dir"
-    config.use_pretrained_word_embedding = True
-    config.dict_dim = 40000  # approx_vocab_size
-    config.class_dim = 2
-
-    # net config
-    config.emb_dim = 300
-    config.droprate_lstm = 0.0
-    config.droprate_fc = 0.0
-    config.word_embedding_trainable = False
-    config.rnn_hid_dim = 2048
-    config.mlp_non_linear = False
-
-    return config
-
-
-def infer_sent_v2():
-    """
-    use our own config
-    """
-    config = basic_config.config()
-    config.learning_rate = 0.0002
-    config.lr_decay = 0.99
-    config.optimizer_type = 'adam'
-    config.save_dirname = "model_dir"
-    config.use_pretrained_word_embedding = True
-    config.dict_dim = 40000  # approx_vocab_size
-    config.class_dim = 2
-
-    # net config
-    config.emb_dim = 300
-    config.droprate_lstm = 0.0
-    config.droprate_fc = 0.2
-    config.word_embedding_trainable = False
-    config.rnn_hid_dim = 2048
-    config.mlp_non_linear = True
-
-    return config
diff --git a/PaddleRec/text_matching_on_quora/configs/sse.py b/PaddleRec/text_matching_on_quora/configs/sse.py
deleted file mode 100755
index 4966465ff2be4cb53d28cfe2be79b0e754f4f977..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/configs/sse.py
+++ /dev/null
@@ -1,43 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from . import basic_config
-
-
-def sse_base():
-    """
-    use config in the paper 'Shortcut-Stacked Sentence Encoders for Multi-Domain Inference'
-    """
-    config = basic_config.config()
-    config.learning_rate = 0.0002
-    config.lr_decay = 0.7
-    config.save_dirname = "model_dir"
-    config.use_pretrained_word_embedding = True
-    config.dict_dim = 40000  # approx_vocab_size
-    config.metric_type = ['accuracy']
-    config.optimizer_type = 'adam'
-    config.use_lod_tensor = True
-    config.embedding_norm = False
-    config.OOV_fill = 'uniform'
-    config.duplicate_data = False
-
-    # net config
-    config.emb_dim = 300
-    config.rnn_hid_dim = [512, 1024, 2048]
-    config.fc_dim = [1600, 1600]
-    config.droprate_lstm = 0.0
-    config.droprate_fc = 0.1
-    config.class_dim = 2
-
-    return config
diff --git a/PaddleRec/text_matching_on_quora/data/prepare_quora_data.sh b/PaddleRec/text_matching_on_quora/data/prepare_quora_data.sh
deleted file mode 100755
index 111c2b8886f304ad1f8bc31cb662506044a9535b..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/data/prepare_quora_data.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Please download the Quora dataset firstly from https://drive.google.com/file/d/0B0PlTAo--BnaQWlsZl9FZ3l1c28/view?usp=sharing
-# to the ROOT_DIR: $HOME/.cache/paddle/dataset
-
-DATA_DIR=$HOME/.cache/paddle/dataset
-wget --directory-prefix=$DATA_DIR http://nlp.stanford.edu/data/glove.840B.300d.zip
-
-unzip $DATA_DIR/glove.840B.300d.zip
-
-# The finally dataset dir should be like
-
-# $HOME/.cache/paddle/dataset
-# |- Quora_question_pair_partition
-#     |- train.tsv
-#     |- test.tsv
-#     |- dev.tsv
-#     |- readme.txt
-#     |- wordvec.txt
-# |- glove.840B.300d.txt
diff --git a/PaddleRec/text_matching_on_quora/imgs/README.md b/PaddleRec/text_matching_on_quora/imgs/README.md
deleted file mode 100644
index 60f55a85a7f564e758d421b9ee10a316e1573e8c..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/imgs/README.md
+++ /dev/null
@@ -1 +0,0 @@
-Image files for this model: text_matching_on_quora
diff --git a/PaddleRec/text_matching_on_quora/imgs/models_test_acc.png b/PaddleRec/text_matching_on_quora/imgs/models_test_acc.png
deleted file mode 100644
index 12b76682123111235f9fb85572431e2e36ce5334..0000000000000000000000000000000000000000
Binary files a/PaddleRec/text_matching_on_quora/imgs/models_test_acc.png and /dev/null differ
diff --git a/PaddleRec/text_matching_on_quora/metric.py b/PaddleRec/text_matching_on_quora/metric.py
deleted file mode 100755
index 382bce6652f08aa1aa8ff3a8b7ad3958b7a56e8a..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/metric.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-"""
-This Module defines evaluate metrics for classification tasks
-"""
-
-
-def accuracy(y_pred, label):
-    """
-    define correct: the top 1 class in y_pred is the same as y_true
-    """
-    y_pred = np.squeeze(y_pred)
-    y_pred_idx = np.argmax(y_pred, axis=1)
-    return 1.0 * np.sum(y_pred_idx == label) / label.shape[0]
-
-
-def accuracy_with_threshold(y_pred, label, threshold=0.5):
-    """
-    define correct: the y_true class's prob in y_pred is bigger than threshold
-    when threshold is 0.5, This fuction is equal to accuracy
-    """
-    y_pred = np.squeeze(y_pred)
-    y_pred_idx = (y_pred[:, 1] > threshold).astype(int)
-    return 1.0 * np.sum(y_pred_idx == label) / label.shape[0]
diff --git a/PaddleRec/text_matching_on_quora/models/__init__.py b/PaddleRec/text_matching_on_quora/models/__init__.py
deleted file mode 100755
index a52665d415e198b3fc47c1ee3f3bc7fa0bde4971..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/__init__.py
+++ /dev/null
@@ -1,18 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .cdssm import cdssmNet
-from .dec_att import DecAttNet
-from .sse import SSENet
-from .infer_sent import InferSentNet
diff --git a/PaddleRec/text_matching_on_quora/models/cdssm.py b/PaddleRec/text_matching_on_quora/models/cdssm.py
deleted file mode 100755
index 334cfebb092e9d691ac7fbdd54f1f5429551cdb2..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/cdssm.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-
-
-class cdssmNet():
-    """cdssm net"""
-
-    def __init__(self, config):
-        self._config = config
-
-    def __call__(self, seq1, seq2, label):
-        return self.body(seq1, seq2, label, self._config)
-
-    def body(self, seq1, seq2, label, config):
-        """Body function"""
-
-        def conv_model(seq):
-            embed = fluid.layers.embedding(
-                input=seq,
-                size=[config.dict_dim, config.emb_dim],
-                param_attr='emb.w')
-            conv = fluid.layers.sequence_conv(
-                embed,
-                num_filters=config.kernel_count,
-                filter_size=config.kernel_size,
-                filter_stride=1,
-                padding=True,  # TODO: what is padding
-                bias_attr=False,
-                param_attr='conv1d.w',
-                act='relu')
-            #print paddle.parameters.get('conv1d.w').shape
-
-            conv = fluid.layers.dropout(conv, dropout_prob=config.droprate_conv)
-            pool = fluid.layers.sequence_pool(conv, pool_type="max")
-            fc = fluid.layers.fc(pool,
-                                 size=config.fc_dim,
-                                 param_attr='fc1.w',
-                                 bias_attr='fc1.b',
-                                 act='relu')
-            return fc
-
-        def MLP(vec):
-            for dim in config.mlp_hid_dim:
-                vec = fluid.layers.fc(vec, size=dim, act='relu')
-                vec = fluid.layers.dropout(vec, dropout_prob=config.droprate_fc)
-            return vec
-
-        seq1_fc = conv_model(seq1)
-        seq2_fc = conv_model(seq2)
-        concated_seq = fluid.layers.concat(input=[seq1_fc, seq2_fc], axis=1)
-        mlp_res = MLP(concated_seq)
-        prediction = fluid.layers.fc(mlp_res,
-                                     size=config.class_dim,
-                                     act='softmax')
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=loss)
-        acc = fluid.layers.accuracy(input=prediction, label=label)
-        return avg_cost, acc, prediction
diff --git a/PaddleRec/text_matching_on_quora/models/dec_att.py b/PaddleRec/text_matching_on_quora/models/dec_att.py
deleted file mode 100755
index 4c3fecbe2b6fbea766dc353688e64780be967b97..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/dec_att.py
+++ /dev/null
@@ -1,159 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-
-
-class DecAttNet():
-    """decompose attention net"""
-
-    def __init__(self, config):
-        self._config = config
-        self.initializer = fluid.initializer.Xavier(uniform=False)
-
-    def __call__(self, seq1, seq2, mask1, mask2, label):
-        return self.body(seq1, seq2, mask1, mask2, label)
-
-    def body(self, seq1, seq2, mask1, mask2, label):
-        """Body function"""
-        transformed_q1 = self.transformation(seq1)
-        transformed_q2 = self.transformation(seq2)
-        masked_q1 = self.apply_mask(transformed_q1, mask1)
-        masked_q2 = self.apply_mask(transformed_q2, mask2)
-        alpha, beta = self.attend(masked_q1, masked_q2)
-        if self._config.share_wight_btw_seq:
-            seq1_compare = self.compare(masked_q1, beta, param_prefix='compare')
-            seq2_compare = self.compare(
-                masked_q2, alpha, param_prefix='compare')
-        else:
-            seq1_compare = self.compare(
-                masked_q1, beta, param_prefix='compare_1')
-            seq2_compare = self.compare(
-                masked_q2, alpha, param_prefix='compare_2')
-        aggregate_res = self.aggregate(seq1_compare, seq2_compare)
-        prediction = fluid.layers.fc(aggregate_res,
-                                     size=self._config.class_dim,
-                                     act='softmax')
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=loss)
-        acc = fluid.layers.accuracy(input=prediction, label=label)
-        return avg_cost, acc, prediction
-
-    def apply_mask(self, seq, mask):
-        """
-       apply mask on seq
-       Input: seq in shape [batch_size, seq_len, embedding_size]
-       Input: mask in shape [batch_size, seq_len]
-       Output: masked seq in shape [batch_size, seq_len, embedding_size]
-       """
-        return fluid.layers.elementwise_mul(x=seq, y=mask, axis=0)
-
-    def feed_forward_2d(self, vec, param_prefix):
-        """
-        Input: vec in shape [batch_size, seq_len, vec_dim]
-        Output: fc2 in shape [batch_size, seq_len, num_units[1]]
-        """
-        fc1 = fluid.layers.fc(vec,
-                              size=self._config.num_units[0],
-                              num_flatten_dims=2,
-                              param_attr=fluid.ParamAttr(
-                                  name=param_prefix + '_fc1.w',
-                                  initializer=self.initializer),
-                              bias_attr=param_prefix + '_fc1.b',
-                              act='relu')
-        fc1 = fluid.layers.dropout(fc1, dropout_prob=self._config.droprate)
-        fc2 = fluid.layers.fc(fc1,
-                              size=self._config.num_units[1],
-                              num_flatten_dims=2,
-                              param_attr=fluid.ParamAttr(
-                                  name=param_prefix + '_fc2.w',
-                                  initializer=self.initializer),
-                              bias_attr=param_prefix + '_fc2.b',
-                              act='relu')
-        fc2 = fluid.layers.dropout(fc2, dropout_prob=self._config.droprate)
-        return fc2
-
-    def feed_forward(self, vec, param_prefix):
-        """
-        Input: vec in shape [batch_size, vec_dim]
-        Output: fc2 in shape [batch_size, num_units[1]]
-        """
-        fc1 = fluid.layers.fc(vec,
-                              size=self._config.num_units[0],
-                              num_flatten_dims=1,
-                              param_attr=fluid.ParamAttr(
-                                  name=param_prefix + '_fc1.w',
-                                  initializer=self.initializer),
-                              bias_attr=param_prefix + '_fc1.b',
-                              act='relu')
-        fc1 = fluid.layers.dropout(fc1, dropout_prob=self._config.droprate)
-        fc2 = fluid.layers.fc(fc1,
-                              size=self._config.num_units[1],
-                              num_flatten_dims=1,
-                              param_attr=fluid.ParamAttr(
-                                  name=param_prefix + '_fc2.w',
-                                  initializer=self.initializer),
-                              bias_attr=param_prefix + '_fc2.b',
-                              act='relu')
-        fc2 = fluid.layers.dropout(fc2, dropout_prob=self._config.droprate)
-        return fc2
-
-    def transformation(self, seq):
-        embed = fluid.layers.embedding(
-            input=seq,
-            size=[self._config.dict_dim, self._config.emb_dim],
-            param_attr=fluid.ParamAttr(
-                name='emb.w', trainable=self._config.word_embedding_trainable))
-        if self._config.proj_emb_dim is not None:
-            return fluid.layers.fc(embed,
-                                   size=self._config.proj_emb_dim,
-                                   num_flatten_dims=2,
-                                   param_attr=fluid.ParamAttr(
-                                       name='project' + '_fc1.w',
-                                       initializer=self.initializer),
-                                   bias_attr=False,
-                                   act=None)
-        return embed
-
-    def attend(self, seq1, seq2):
-        """
-        Input: seq1, shape [batch_size, seq_len1, embed_size]
-        Input: seq2, shape [batch_size, seq_len2, embed_size]
-        Output: alpha, shape [batch_size, seq_len1, embed_size]
-        Output: beta, shape [batch_size, seq_len2, embed_size]
-        """
-        if self._config.share_wight_btw_seq:
-            seq1 = self.feed_forward_2d(seq1, param_prefix="attend")
-            seq2 = self.feed_forward_2d(seq2, param_prefix="attend")
-        else:
-            seq1 = self.feed_forward_2d(seq1, param_prefix="attend_1")
-            seq2 = self.feed_forward_2d(seq2, param_prefix="attend_2")
-        attention_weight = fluid.layers.matmul(seq1, seq2, transpose_y=True)
-        normalized_attention_weight = fluid.layers.softmax(attention_weight)
-        beta = fluid.layers.matmul(normalized_attention_weight, seq2)
-        attention_weight_t = fluid.layers.transpose(
-            attention_weight, perm=[0, 2, 1])
-        normalized_attention_weight_t = fluid.layers.softmax(attention_weight_t)
-        alpha = fluid.layers.matmul(normalized_attention_weight_t, seq1)
-        return alpha, beta
-
-    def compare(self, seq, soft_alignment, param_prefix):
-        concat_seq = fluid.layers.concat(input=[seq, soft_alignment], axis=2)
-        return self.feed_forward_2d(concat_seq, param_prefix="compare")
-
-    def aggregate(self, vec1, vec2):
-        vec1 = fluid.layers.reduce_sum(vec1, dim=1)
-        vec2 = fluid.layers.reduce_sum(vec2, dim=1)
-        concat_vec = fluid.layers.concat(input=[vec1, vec2], axis=1)
-        return self.feed_forward(concat_vec, param_prefix='aggregate')
diff --git a/PaddleRec/text_matching_on_quora/models/infer_sent.py b/PaddleRec/text_matching_on_quora/models/infer_sent.py
deleted file mode 100644
index 67de901bdd4d5f8ce1bcc911c2de87fe89f58033..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/infer_sent.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from .my_layers import bi_lstm_layer
-from .match_layers import ElementwiseMatching
-
-
-class InferSentNet():
-    """
-    Base on the paper: Supervised Learning of Universal Sentence Representations from Natural Language Inference Data:
-    https://arxiv.org/abs/1705.02364
-    """
-
-    def __init__(self, config):
-        self._config = config
-
-    def __call__(self, seq1, seq2, label):
-        return self.body(seq1, seq2, label, self._config)
-
-    def body(self, seq1, seq2, label, config):
-        """Body function"""
-
-        seq1_rnn = self.encoder(seq1)
-        seq2_rnn = self.encoder(seq2)
-        seq_match = ElementwiseMatching(seq1_rnn, seq2_rnn)
-
-        mlp_res = self.MLP(seq_match)
-        prediction = fluid.layers.fc(mlp_res,
-                                     size=self._config.class_dim,
-                                     act='softmax')
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=loss)
-        acc = fluid.layers.accuracy(input=prediction, label=label)
-        return avg_cost, acc, prediction
-
-    def encoder(self, seq):
-        """encoder"""
-
-        embed = fluid.layers.embedding(
-            input=seq,
-            size=[self._config.dict_dim, self._config.emb_dim],
-            param_attr=fluid.ParamAttr(
-                name='emb.w', trainable=self._config.word_embedding_trainable))
-
-        bi_lstm_h = bi_lstm_layer(
-            embed, rnn_hid_dim=self._config.rnn_hid_dim, name='encoder')
-
-        bi_lstm_h = fluid.layers.dropout(
-            bi_lstm_h, dropout_prob=self._config.droprate_lstm)
-        pool = fluid.layers.sequence_pool(input=bi_lstm_h, pool_type='max')
-        return pool
-
-    def MLP(self, vec):
-        if self._config.mlp_non_linear:
-            drop1 = fluid.layers.dropout(
-                vec, dropout_prob=self._config.droprate_fc)
-            fc1 = fluid.layers.fc(drop1, size=512, act='tanh')
-            drop2 = fluid.layers.dropout(
-                fc1, dropout_prob=self._config.droprate_fc)
-            fc2 = fluid.layers.fc(drop2, size=512, act='tanh')
-            res = fluid.layers.dropout(
-                fc2, dropout_prob=self._config.droprate_fc)
-        else:
-            fc1 = fluid.layers.fc(vec, size=512, act=None)
-            res = fluid.layers.fc(fc1, size=512, act=None)
-        return res
diff --git a/PaddleRec/text_matching_on_quora/models/match_layers.py b/PaddleRec/text_matching_on_quora/models/match_layers.py
deleted file mode 100755
index 314d5b2c290d5befb7db12a1d31a6e33fc70e3b8..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/match_layers.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This Module provide different kinds of Match layers
-"""
-
-import paddle.fluid as fluid
-
-
-def MultiPerspectiveMatching(vec1, vec2, perspective_num):
-    """
-    MultiPerspectiveMatching
-    """
-    sim_res = None
-    for i in range(perspective_num):
-        vec1_res = fluid.layers.elementwise_add_with_weight(
-            vec1, param_attr="elementwise_add_with_weight." + str(i))
-        vec2_res = fluid.layers.elementwise_add_with_weight(
-            vec2, param_attr="elementwise_add_with_weight." + str(i))
-        m = fluid.layers.cos_sim(vec1_res, vec2_res)
-        if sim_res is None:
-            sim_res = m
-        else:
-            sim_res = fluid.layers.concat(input=[sim_res, m], axis=1)
-    return sim_res
-
-
-def ConcateMatching(vec1, vec2):
-    """
-    ConcateMatching
-    """
-    #TODO: assert shape
-    return fluid.layers.concat(input=[vec1, vec2], axis=1)
-
-
-def ElementwiseMatching(vec1, vec2):
-    """
-    reference: [Supervised Learning of Universal Sentence Representations from Natural Language Inference Data](https://arxiv.org/abs/1705.02364)
-    """
-    elementwise_mul = fluid.layers.elementwise_mul(x=vec1, y=vec2)
-    elementwise_sub = fluid.layers.elementwise_sub(x=vec1, y=vec2)
-    elementwise_abs_sub = fluid.layers.abs(elementwise_sub)
-    return fluid.layers.concat(
-        input=[vec1, vec2, elementwise_mul, elementwise_abs_sub], axis=1)
diff --git a/PaddleRec/text_matching_on_quora/models/my_layers.py b/PaddleRec/text_matching_on_quora/models/my_layers.py
deleted file mode 100755
index 374d798209920dbd20f447269a62e57f38523d0a..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/my_layers.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module defines some Frequently-used DNN layers
-"""
-
-import paddle.fluid as fluid
-
-
-def bi_lstm_layer(input, rnn_hid_dim, name):
-    """
-    This is a Bi-directional LSTM(long short term memory) Module
-    """
-    fc0 = fluid.layers.fc(
-        input=input,  # fc for lstm
-        size=rnn_hid_dim * 4,
-        param_attr=name + '.fc0.w',
-        bias_attr=False,
-        act=None)
-
-    lstm_h, c = fluid.layers.dynamic_lstm(
-        input=fc0,
-        size=rnn_hid_dim * 4,
-        is_reverse=False,
-        param_attr=name + '.lstm_w',
-        bias_attr=name + '.lstm_b')
-
-    reversed_lstm_h, reversed_c = fluid.layers.dynamic_lstm(
-        input=fc0,
-        size=rnn_hid_dim * 4,
-        is_reverse=True,
-        param_attr=name + '.reversed_lstm_w',
-        bias_attr=name + '.reversed_lstm_b')
-    return fluid.layers.concat(input=[lstm_h, reversed_lstm_h], axis=1)
diff --git a/PaddleRec/text_matching_on_quora/models/pwim.py b/PaddleRec/text_matching_on_quora/models/pwim.py
deleted file mode 100644
index 7b60ec4823f845cdcd694ae6d1c617a0eeda1909..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/pwim.py
+++ /dev/null
@@ -1,14 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Just for test `git push`
diff --git a/PaddleRec/text_matching_on_quora/models/sse.py b/PaddleRec/text_matching_on_quora/models/sse.py
deleted file mode 100644
index 621f442528eb038457c4f4d99ef47c676a11ad6e..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/sse.py
+++ /dev/null
@@ -1,81 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.fluid as fluid
-from .my_layers import bi_lstm_layer
-from .match_layers import ElementwiseMatching
-
-
-class SSENet():
-    """
-    SSE net: Shortcut-Stacked Sentence Encoders for Multi-Domain Inference
-    https://arxiv.org/abs/1708.02312
-    """
-
-    def __init__(self, config):
-        self._config = config
-
-    def __call__(self, seq1, seq2, label):
-        return self.body(seq1, seq2, label, self._config)
-
-    def body(self, seq1, seq2, label, config):
-        """Body function"""
-
-        def stacked_bi_rnn_model(seq):
-            embed = fluid.layers.embedding(
-                input=seq,
-                size=[self._config.dict_dim, self._config.emb_dim],
-                param_attr='emb.w')
-            stacked_lstm_out = [embed]
-            for i in range(len(self._config.rnn_hid_dim)):
-                if i == 0:
-                    feature = embed
-                else:
-                    feature = fluid.layers.concat(
-                        input=stacked_lstm_out, axis=1)
-                bi_lstm_h = bi_lstm_layer(
-                    feature,
-                    rnn_hid_dim=self._config.rnn_hid_dim[i],
-                    name="lstm_" + str(i))
-
-                # add dropout except for the last stacked lstm layer
-                if i != len(self._config.rnn_hid_dim) - 1:
-                    bi_lstm_h = fluid.layers.dropout(
-                        bi_lstm_h, dropout_prob=self._config.droprate_lstm)
-                stacked_lstm_out.append(bi_lstm_h)
-            pool = fluid.layers.sequence_pool(input=bi_lstm_h, pool_type='max')
-            return pool
-
-        def MLP(vec):
-            for i in range(len(self._config.fc_dim)):
-                vec = fluid.layers.fc(vec,
-                                      size=self._config.fc_dim[i],
-                                      act='relu')
-                # add dropout after every layer of MLP
-                vec = fluid.layers.dropout(
-                    vec, dropout_prob=self._config.droprate_fc)
-            return vec
-
-        seq1_rnn = stacked_bi_rnn_model(seq1)
-        seq2_rnn = stacked_bi_rnn_model(seq2)
-        seq_match = ElementwiseMatching(seq1_rnn, seq2_rnn)
-
-        mlp_res = MLP(seq_match)
-        prediction = fluid.layers.fc(mlp_res,
-                                     size=self._config.class_dim,
-                                     act='softmax')
-        loss = fluid.layers.cross_entropy(input=prediction, label=label)
-        avg_cost = fluid.layers.mean(x=loss)
-        acc = fluid.layers.accuracy(input=prediction, label=label)
-        return avg_cost, acc, prediction
diff --git a/PaddleRec/text_matching_on_quora/models/test.py b/PaddleRec/text_matching_on_quora/models/test.py
deleted file mode 100644
index 33ed0ecf10ec4cad807ebb6df1590de65eeeab1e..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/models/test.py
+++ /dev/null
@@ -1,13 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/PaddleRec/text_matching_on_quora/pretrained_word2vec.py b/PaddleRec/text_matching_on_quora/pretrained_word2vec.py
deleted file mode 100755
index a6df805533bdb141af5c6665a4fb95762bf659c8..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/pretrained_word2vec.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This Module provide pretrained word-embeddings 
-"""
-
-from __future__ import print_function, unicode_literals
-import numpy as np
-import time, datetime
-import os, sys
-
-
-def maybe_open(filepath):
-    if sys.version_info <= (3, 0):  # for python2
-        return open(filepath, 'r')
-    else:
-        return open(filepath, 'r', encoding="utf-8")
-
-
-def Glove840B_300D(filepath, keys=None):
-    """
-    input: the "glove.840B.300d.txt" file path
-    return: a dict, key: word (unicode), value: a numpy array with shape [300]
-    """
-    if keys is not None:
-        assert (isinstance(keys, set))
-    print("loading word2vec from ", filepath)
-    print("please wait for a minute.")
-    start = time.time()
-    word2vec = {}
-    with maybe_open(filepath) as f:
-        for line in f:
-            if sys.version_info <= (3, 0):  # for python2
-                line = line.decode('utf-8')
-            info = line.strip("\n").split(" ")
-            word = info[0]
-            if (keys is not None) and (word not in keys):
-                continue
-            vector = info[1:]
-            assert (len(vector) == 300)
-            word2vec[word] = np.asarray(vector, dtype='float32')
-
-    end = time.time()
-    print(
-        "Spent ",
-        str(datetime.timedelta(seconds=end - start)),
-        " on loading word2vec.")
-    return word2vec
-
-
-if __name__ == '__main__':
-    from os.path import expanduser
-    home = expanduser("~")
-    embed_dict = Glove840B_300D(
-        os.path.join(home, "./.cache/paddle/dataset/glove.840B.300d.txt"))
-    exit(0)
diff --git a/PaddleRec/text_matching_on_quora/quora_question_pairs.py b/PaddleRec/text_matching_on_quora/quora_question_pairs.py
deleted file mode 100755
index e21742aedb4bb0a5211ac00e78904a9d856cb74c..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/quora_question_pairs.py
+++ /dev/null
@@ -1,195 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-"""
-
-import paddle.dataset.common
-import collections
-import tarfile
-import re
-import string
-import random
-import os, sys
-import nltk
-from os.path import expanduser
-
-__all__ = ['word_dict', 'train', 'dev', 'test']
-
-URL = "https://drive.google.com/file/d/0B0PlTAo--BnaQWlsZl9FZ3l1c28/view"
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
-DATA_DIR = "Quora_question_pair_partition"
-
-QUORA_TRAIN_FILE_NAME = os.path.join(DATA_HOME, DATA_DIR, 'train.tsv')
-QUORA_DEV_FILE_NAME = os.path.join(DATA_HOME, DATA_DIR, 'dev.tsv')
-QUORA_TEST_FILE_NAME = os.path.join(DATA_HOME, DATA_DIR, 'test.tsv')
-
-# punctuation or nltk or space
-TOKENIZE_METHOD = 'space'
-
-COLUMN_COUNT = 4
-
-
-def tokenize(s):
-    if sys.version_info <= (3, 0):  # for python2
-        s = s.decode('utf-8')
-    if TOKENIZE_METHOD == "nltk":
-        return nltk.tokenize.word_tokenize(s)
-    elif TOKENIZE_METHOD == "punctuation":
-        return s.translate({ord(char): None
-                            for char in string.punctuation}).lower().split()
-    elif TOKENIZE_METHOD == "space":
-        return s.split()
-    else:
-        raise RuntimeError("Invalid tokenize method")
-
-
-def maybe_open(file_name):
-    if not os.path.isfile(file_name):
-        msg = "file not exist: %s\nPlease download the dataset firstly from: %s\n\n" % (file_name, URL) + \
-                ("# The finally dataset dir should be like\n\n"
-                "$HOME/.cache/paddle/dataset\n"
-                " |- Quora_question_pair_partition\n"
-                "     |- train.tsv\n"
-                "     |- test.tsv\n"
-                "     |- dev.tsv\n"
-                "     |- readme.txt\n"
-                "     |- wordvec.txt\n")
-        raise RuntimeError(msg)
-    if sys.version_info <= (3, 0):  # for python2
-        return open(file_name, 'r')
-    else:
-        return open(file_name, 'r', encoding="utf-8")
-
-
-def tokenized_question_pairs(file_name):
-    """
-    """
-    with maybe_open(file_name) as f:
-        questions = {}
-        lines = f.readlines()
-        for line in lines:
-            info = line.strip().split('\t')
-            if len(info) != COLUMN_COUNT:
-                # formatting error
-                continue
-            (label, question1, question2, id) = info
-            question1 = tokenize(question1)
-            question2 = tokenize(question2)
-            yield question1, question2, int(label)
-
-
-def tokenized_questions(file_name):
-    """
-    """
-    with maybe_open(file_name) as f:
-        lines = f.readlines()
-        for line in lines:
-            info = line.strip().split('\t')
-            if len(info) != COLUMN_COUNT:
-                # formatting error
-                continue
-            (label, question1, question2, id) = info
-            yield tokenize(question1)
-            yield tokenize(question2)
-
-
-def build_dict(file_name, cutoff):
-    """
-    Build a word dictionary from the corpus. Keys of the dictionary are words,
-    and values are zero-based IDs of these words.
-    """
-    word_freq = collections.defaultdict(int)
-    for doc in tokenized_questions(file_name):
-        for word in doc:
-            word_freq[word] += 1
-
-    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
-
-    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
-    words, _ = list(zip(*dictionary))
-    word_idx = dict(zip(words, range(len(words))))
-    word_idx['<unk>'] = len(words)
-    word_idx['<pad>'] = len(words) + 1
-    return word_idx
-
-
-def reader_creator(file_name, word_idx):
-    UNK_ID = word_idx['<unk>']
-
-    def reader():
-        for (q1, q2, label) in tokenized_question_pairs(file_name):
-            q1_ids = [word_idx.get(w, UNK_ID) for w in q1]
-            q2_ids = [word_idx.get(w, UNK_ID) for w in q2]
-            if q1_ids != [] and q2_ids != []:  # [] is not allowed in fluid
-                assert (label in [0, 1])
-                yield q1_ids, q2_ids, label
-
-    return reader
-
-
-def train(word_idx):
-    """
-    Quora training set creator.
-
-    It returns a reader creator, each sample in the reader is two zero-based ID
-    list and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Training reader creator
-    :rtype: callable
-    """
-    return reader_creator(QUORA_TRAIN_FILE_NAME, word_idx)
-
-
-def dev(word_idx):
-    """
-    Quora develop set creator.
-
-    It returns a reader creator, each sample in the reader is two zero-based ID
-    list and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: develop reader creator
-    :rtype: callable
-
-    """
-    return reader_creator(QUORA_DEV_FILE_NAME, word_idx)
-
-
-def test(word_idx):
-    """
-    Quora test set creator.
-
-    It returns a reader creator, each sample in the reader is two zero-based ID
-    list and label in [0, 1].
-
-    :param word_idx: word dictionary
-    :type word_idx: dict
-    :return: Test reader creator
-    :rtype: callable
-    """
-    return reader_creator(QUORA_TEST_FILE_NAME, word_idx)
-
-
-def word_dict():
-    """
-    Build a word dictionary from the corpus.
-
-    :return: Word dictionary
-    :rtype: dict
-    """
-    return build_dict(file_name=QUORA_TRAIN_FILE_NAME, cutoff=4)
diff --git a/PaddleRec/text_matching_on_quora/train_and_evaluate.py b/PaddleRec/text_matching_on_quora/train_and_evaluate.py
deleted file mode 100755
index 303dd4156a874fa1debaf4cfbe6295c736954ba3..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/train_and_evaluate.py
+++ /dev/null
@@ -1,314 +0,0 @@
-#Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import os
-import sys
-import time
-import argparse
-import unittest
-import contextlib
-import numpy as np
-
-import paddle.fluid as fluid
-
-import utils, metric, configs
-import models
-
-from pretrained_word2vec import Glove840B_300D
-
-parser = argparse.ArgumentParser(description=__doc__)
-
-parser.add_argument(
-    '--model_name', type=str, default='cdssmNet', help="Which model to train")
-parser.add_argument(
-    '--config',
-    type=str,
-    default='cdssm_base',
-    help="The global config setting")
-parser.add_argument(
-    '--enable_ce',
-    action='store_true',
-    help='If set, run the task with continuous evaluation logs.')
-parser.add_argument('--epoch_num', type=int, help='Number of epoch')
-
-DATA_DIR = os.path.join(os.path.expanduser('~'), '.cache/paddle/dataset')
-
-
-def evaluate(epoch_id, exe, inference_program, dev_reader, test_reader,
-             fetch_list, feeder, metric_type):
-    """
-    evaluate on test/dev dataset
-    """
-
-    def infer(test_reader):
-        """
-        do inference function
-        """
-        total_cost = 0.0
-        total_count = 0
-        preds, labels = [], []
-        for data in test_reader():
-            avg_cost, avg_acc, batch_prediction = exe.run(
-                inference_program,
-                feed=feeder.feed(data),
-                fetch_list=fetch_list,
-                return_numpy=True)
-            total_cost += avg_cost * len(data)
-            total_count += len(data)
-            preds.append(batch_prediction)
-            labels.append(np.asarray([x[-1] for x in data], dtype=np.int64))
-        y_pred = np.concatenate(preds)
-        y_label = np.concatenate(labels)
-
-        metric_res = []
-        for metric_name in metric_type:
-            if metric_name == 'accuracy_with_threshold':
-                metric_res.append((metric_name, metric.accuracy_with_threshold(
-                    y_pred, y_label, threshold=0.3)))
-            elif metric_name == 'accuracy':
-                metric_res.append(
-                    (metric_name, metric.accuracy(y_pred, y_label)))
-            else:
-                print("Unknown metric type: ", metric_name)
-                exit()
-        return total_cost / (total_count * 1.0), metric_res
-
-    dev_cost, dev_metric_res = infer(dev_reader)
-    print("[%s] epoch_id: %d, dev_cost: %f, " % (time.asctime(
-        time.localtime(time.time())), epoch_id, dev_cost) + ', '.join(
-            [str(x[0]) + ": " + str(x[1]) for x in dev_metric_res]))
-
-    test_cost, test_metric_res = infer(test_reader)
-    print("[%s] epoch_id: %d, test_cost: %f, " % (time.asctime(
-        time.localtime(time.time())), epoch_id, test_cost) + ', '.join(
-            [str(x[0]) + ": " + str(x[1]) for x in test_metric_res]))
-    print("")
-
-
-def train_and_evaluate(train_reader, dev_reader, test_reader, network,
-                       optimizer, global_config, pretrained_word_embedding,
-                       use_cuda, parallel):
-    """
-    train network
-    """
-
-    # define the net
-    if global_config.use_lod_tensor:
-        # automatic add batch dim
-        q1 = fluid.layers.data(
-            name="question1", shape=[1], dtype="int64", lod_level=1)
-        q2 = fluid.layers.data(
-            name="question2", shape=[1], dtype="int64", lod_level=1)
-        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-        cost, acc, prediction = network(q1, q2, label)
-    else:
-        # shape: [batch_size, max_seq_len_in_batch, 1]
-        q1 = fluid.layers.data(
-            name="question1", shape=[-1, -1, 1], dtype="int64")
-        q2 = fluid.layers.data(
-            name="question2", shape=[-1, -1, 1], dtype="int64")
-        # shape: [batch_size, max_seq_len_in_batch]
-        mask1 = fluid.layers.data(name="mask1", shape=[-1, -1], dtype="float32")
-        mask2 = fluid.layers.data(name="mask2", shape=[-1, -1], dtype="float32")
-        label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-        cost, acc, prediction = network(q1, q2, mask1, mask2, label)
-
-    if parallel:
-        # TODO: Paarallel Training
-        print("Parallel Training is not supported for now.")
-        sys.exit(1)
-
-    #optimizer.minimize(cost)
-    if use_cuda:
-        print("Using GPU")
-        place = fluid.CUDAPlace(0)
-    else:
-        print("Using CPU")
-        place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-
-    if global_config.use_lod_tensor:
-        feeder = fluid.DataFeeder(feed_list=[q1, q2, label], place=place)
-    else:
-        feeder = fluid.DataFeeder(
-            feed_list=[q1, q2, mask1, mask2, label], place=place)
-
-    # only for ce
-    args = parser.parse_args()
-    if args.enable_ce:
-        SEED = 102
-        fluid.default_startup_program().random_seed = SEED
-        fluid.default_main_program().random_seed = SEED
-
-    # logging param info
-    for param in fluid.default_main_program().global_block().all_parameters():
-        print("param name: %s; param shape: %s" % (param.name, param.shape))
-
-    # define inference_program
-    inference_program = fluid.default_main_program().clone(for_test=True)
-
-    optimizer.minimize(cost)
-
-    exe.run(fluid.default_startup_program())
-
-    # load emb from a numpy erray
-    if pretrained_word_embedding is not None:
-        print("loading pretrained word embedding to param")
-        embedding_name = "emb.w"
-        embedding_param = fluid.global_scope().find_var(
-            embedding_name).get_tensor()
-        embedding_param.set(pretrained_word_embedding, place)
-
-    evaluate(
-        -1,
-        exe,
-        inference_program,
-        dev_reader,
-        test_reader,
-        fetch_list=[cost, acc, prediction],
-        feeder=feeder,
-        metric_type=global_config.metric_type)
-
-    # start training
-    total_time = 0.0
-    print("[%s] Start Training" % time.asctime(time.localtime(time.time())))
-    for epoch_id in range(global_config.epoch_num):
-
-        data_size, data_count, total_acc, total_cost = 0, 0, 0.0, 0.0
-        batch_id = 0
-        epoch_begin_time = time.time()
-        for data in train_reader():
-            avg_cost_np, avg_acc_np = exe.run(fluid.default_main_program(),
-                                              feed=feeder.feed(data),
-                                              fetch_list=[cost, acc])
-            data_size = len(data)
-            total_acc += data_size * avg_acc_np[0]
-            total_cost += data_size * avg_cost_np[0]
-            data_count += data_size
-            if batch_id % 100 == 0:
-                print("[%s] epoch_id: %d, batch_id: %d, cost: %f, acc: %f" %
-                      (time.asctime(time.localtime(time.time())), epoch_id,
-                       batch_id, avg_cost_np, avg_acc_np))
-            batch_id += 1
-        avg_cost = total_cost / data_count
-        avg_acc = total_acc / data_count
-        epoch_end_time = time.time()
-        total_time += epoch_end_time - epoch_begin_time
-
-        print("")
-        print(
-            "[%s] epoch_id: %d, train_avg_cost: %f, train_avg_acc: %f, epoch_time_cost: %f"
-            % (time.asctime(time.localtime(time.time())), epoch_id, avg_cost,
-               avg_acc, time.time() - epoch_begin_time))
-
-        # only for ce
-        if epoch_id == global_config.epoch_num - 1 and args.enable_ce:
-            #Note: The following logs are special for CE monitoring.
-            #Other situations do not need to care about these logs.
-            gpu_num = get_cards(args)
-            print("kpis\teach_pass_duration_card%s\t%s" % \
-                  (gpu_num, total_time / (global_config.epoch_num)))
-            print("kpis\ttrain_avg_cost_card%s\t%s" % (gpu_num, avg_cost))
-            print("kpis\ttrain_avg_acc_card%s\t%s" % (gpu_num, avg_acc))
-
-        epoch_model = global_config.save_dirname + "/" + "epoch" + str(epoch_id)
-        fluid.io.save_inference_model(
-            epoch_model, ["question1", "question2", "label"], acc, exe)
-
-        evaluate(
-            epoch_id,
-            exe,
-            inference_program,
-            dev_reader,
-            test_reader,
-            fetch_list=[cost, acc, prediction],
-            feeder=feeder,
-            metric_type=global_config.metric_type)
-
-
-def main():
-    """
-    This function will parse argments, prepare data and prepare pretrained embedding
-    """
-    args = parser.parse_args()
-    global_config = configs.__dict__[args.config]()
-
-    if args.epoch_num != None:
-        global_config.epoch_num = args.epoch_num
-
-    print("net_name: ", args.model_name)
-    net = models.__dict__[args.model_name](global_config)
-
-    # get word_dict
-    word_dict = utils.getDict(data_type="quora_question_pairs")
-
-    # get reader
-    train_reader, dev_reader, test_reader = utils.prepare_data(
-        "quora_question_pairs",
-        word_dict=word_dict,
-        batch_size=global_config.batch_size,
-        buf_size=800000,
-        duplicate_data=global_config.duplicate_data,
-        use_pad=(not global_config.use_lod_tensor))
-
-    # load pretrained_word_embedding
-    if global_config.use_pretrained_word_embedding:
-        word2vec = Glove840B_300D(
-            filepath=os.path.join(DATA_DIR, "glove.840B.300d.txt"),
-            keys=set(word_dict.keys()))
-        pretrained_word_embedding = utils.get_pretrained_word_embedding(
-            word2vec=word2vec, word2id=word_dict, config=global_config)
-        print("pretrained_word_embedding to be load:",
-              pretrained_word_embedding)
-    else:
-        pretrained_word_embedding = None
-
-    # define optimizer
-    optimizer = utils.getOptimizer(global_config)
-
-    # use cuda or not
-    if not global_config.has_member('use_cuda'):
-        if 'CUDA_VISIBLE_DEVICES' in os.environ and os.environ[
-                'CUDA_VISIBLE_DEVICES'] != '':
-            global_config.use_cuda = True
-        else:
-            global_config.use_cuda = False
-
-    global_config.list_config()
-
-    train_and_evaluate(
-        train_reader,
-        dev_reader,
-        test_reader,
-        net,
-        optimizer,
-        global_config,
-        pretrained_word_embedding,
-        use_cuda=global_config.use_cuda,
-        parallel=False)
-
-
-def get_cards(args):
-    if args.enable_ce:
-        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
-        num = len(cards.split(","))
-        return num
-    else:
-        return args.num_devices
-
-
-if __name__ == "__main__":
-    main()
diff --git a/PaddleRec/text_matching_on_quora/utils.py b/PaddleRec/text_matching_on_quora/utils.py
deleted file mode 100755
index 71df50039b7d20091643211f19b3dbdc9421d746..0000000000000000000000000000000000000000
--- a/PaddleRec/text_matching_on_quora/utils.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This module provides utilities for data generator and optimizer definition 
-"""
-
-import sys
-import time
-import numpy as np
-
-import paddle.fluid as fluid
-import paddle
-import quora_question_pairs
-
-
-def to_lodtensor(data, place):
-    """
-    convert to LODtensor
-    """
-    seq_lens = [len(seq) for seq in data]
-    cur_len = 0
-    lod = [cur_len]
-    for l in seq_lens:
-        cur_len += l
-        lod.append(cur_len)
-    flattened_data = np.concatenate(data, axis=0).astype("int64")
-    flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = fluid.LoDTensor()
-    res.set(flattened_data, place)
-    res.set_lod([lod])
-    return res
-
-
-def getOptimizer(global_config):
-    """
-    get Optimizer by config
-    """
-    if global_config.optimizer_type == "adam":
-        optimizer = fluid.optimizer.Adam(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=global_config.learning_rate,
-                decay_steps=global_config.train_samples_num //
-                global_config.batch_size,
-                decay_rate=global_config.lr_decay))
-    elif global_config.optimizer_type == "sgd":
-        optimizer = fluid.optimizer.SGD(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=global_config.learning_rate,
-                decay_steps=global_config.train_samples_num //
-                global_config.batch_size,
-                decay_rate=global_config.lr_decay))
-
-    elif global_config.optimizer_type == "adagrad":
-        optimizer = fluid.optimizer.Adagrad(
-            learning_rate=fluid.layers.exponential_decay(
-                learning_rate=global_config.learning_rate,
-                decay_steps=global_config.train_samples_num //
-                global_config.batch_size,
-                decay_rate=global_config.lr_decay))
-
-    return optimizer
-
-
-def get_pretrained_word_embedding(word2vec, word2id, config):
-    """get pretrained embedding in shape [config.dict_dim, config.emb_dim]"""
-    print("preparing pretrained word embedding ...")
-    assert (config.dict_dim >= len(word2id))
-    word2id = sorted(word2id.items(), key=lambda x: x[1])
-    words = [x[0] for x in word2id]
-    words = words + ['<not-a-real-words>'] * (config.dict_dim - len(words))
-    pretrained_emb = []
-    for _, word in enumerate(words):
-        if word in word2vec:
-            assert (len(word2vec[word] == config.emb_dim))
-            if config.embedding_norm:
-                pretrained_emb.append(word2vec[word] /
-                                      np.linalg.norm(word2vec[word]))
-            else:
-                pretrained_emb.append(word2vec[word])
-        elif config.OOV_fill == 'uniform':
-            pretrained_emb.append(
-                np.random.uniform(
-                    -0.05, 0.05, size=[config.emb_dim]).astype(np.float32))
-        elif config.OOV_fill == 'normal':
-            pretrained_emb.append(
-                np.random.normal(
-                    loc=0.0, scale=0.1, size=[config.emb_dim]).astype(
-                        np.float32))
-        else:
-            print("Unkown OOV fill method: ", OOV_fill)
-            exit()
-    word_embedding = np.stack(pretrained_emb)
-    return word_embedding
-
-
-def getDict(data_type="quora_question_pairs"):
-    """
-    get word2id dict from quora dataset
-    """
-    print("Generating word dict...")
-    if data_type == "quora_question_pairs":
-        word_dict = quora_question_pairs.word_dict()
-    else:
-        raise RuntimeError("No such dataset")
-    print("Vocab size: ", len(word_dict))
-    return word_dict
-
-
-def duplicate(reader):
-    """
-    duplicate the quora qestion pairs since there are 2 questions in a sample
-    Input: reader, which yield (question1, question2, label)
-    Output: reader, which yield (question1, question2, label) and yield (question2, question1, label)
-    """
-
-    def duplicated_reader():
-        for data in reader():
-            (q1, q2, label) = data
-            yield (q1, q2, label)
-            yield (q2, q1, label)
-
-    return duplicated_reader
-
-
-def pad(reader, PAD_ID):
-    """
-    Input: reader, yield batches of [(question1, question2, label), ... ]
-    Output: padded_reader, yield batches of [(padded_question1, padded_question2, mask1, mask2, label), ... ]
-    """
-
-    assert (isinstance(PAD_ID, int))
-
-    def padded_reader():
-        for batch in reader():
-            max_len1 = max([len(data[0]) for data in batch])
-            max_len2 = max([len(data[1]) for data in batch])
-
-            padded_batch = []
-            for data in batch:
-                question1, question2, label = data
-                seq_len1 = len(question1)
-                seq_len2 = len(question2)
-                mask1 = [1] * seq_len1 + [0] * (max_len1 - seq_len1)
-                mask2 = [1] * seq_len2 + [0] * (max_len2 - seq_len2)
-                padded_question1 = question1 + [PAD_ID] * (max_len1 - seq_len1)
-                padded_question2 = question2 + [PAD_ID] * (max_len2 - seq_len2)
-                padded_question1 = [
-                    [x] for x in padded_question1
-                ]  # last dim of questions must be 1, according to fluid's request
-                padded_question2 = [[x] for x in padded_question2]
-                assert (len(mask1) == max_len1)
-                assert (len(mask2) == max_len2)
-                assert (len(padded_question1) == max_len1)
-                assert (len(padded_question2) == max_len2)
-                padded_batch.append(
-                    (padded_question1, padded_question2, mask1, mask2, label))
-            yield padded_batch
-
-    return padded_reader
-
-
-def prepare_data(data_type,
-                 word_dict,
-                 batch_size,
-                 buf_size=50000,
-                 duplicate_data=False,
-                 use_pad=False):
-    """
-    prepare data
-    """
-
-    PAD_ID = word_dict['<pad>']
-
-    if data_type == "quora_question_pairs":
-        # train/dev/test reader are batched iters which yield a batch of (question1, question2, label) each time
-        # qestion1 and question2 are lists of word ID
-        # label is 0 or 1
-        # for example: ([1, 3, 2], [7, 5, 4, 99], 1)
-
-        def prepare_reader(reader):
-            if duplicate_data:
-                reader = duplicate(reader)
-            reader = paddle.batch(
-                paddle.reader.shuffle(
-                    reader, buf_size=buf_size),
-                batch_size=batch_size,
-                drop_last=False)
-            if use_pad:
-                reader = pad(reader, PAD_ID=PAD_ID)
-            return reader
-
-        train_reader = prepare_reader(quora_question_pairs.train(word_dict))
-        dev_reader = prepare_reader(quora_question_pairs.dev(word_dict))
-        test_reader = prepare_reader(quora_question_pairs.test(word_dict))
-
-    else:
-        raise RuntimeError("no such dataset")
-
-    return train_reader, dev_reader, test_reader
diff --git a/PaddleRec/word2vec/README.md b/PaddleRec/word2vec/README.md
index eae86615e56a2687ba83315dbc8874ba4fbcd765..bdff6ea1c5055c8867bc73237779489fb7217aed 100644
--- a/PaddleRec/word2vec/README.md
+++ b/PaddleRec/word2vec/README.md
@@ -4,8 +4,6 @@
 
 ```text
 .
-├── cluster_train.py    # 分布式训练函数
-├── cluster_train.sh    # 本地模拟多机脚本
 ├── train.py            # 训练函数
 ├── infer.py            # 预测脚本
 ├── net.py              # 网络结构
@@ -97,11 +95,6 @@ python train.py -h
 OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 --dict_path data/test_build_dict --num_passes 10 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse
 ```
 
-本地单机模拟多机训练, 目前暂不支持windows。
-
-```bash
-sh cluster_train.sh
-```
 若需要开启shuffle_batch功能，需在命令中加入`--with_shuffle_batch`。单机模拟分布式多机训练，需更改`cluster_train.sh`文件，在各个节点的启动命令中加入`--with_shuffle_batch`。
 
 ## 预测
diff --git a/PaddleRec/word2vec/cluster_train.py b/PaddleRec/word2vec/cluster_train.py
deleted file mode 100644
index 11054ce3231b1e400d0d683566e538dd2abd828c..0000000000000000000000000000000000000000
--- a/PaddleRec/word2vec/cluster_train.py
+++ /dev/null
@@ -1,264 +0,0 @@
-from __future__ import print_function
-import argparse
-import logging
-import os
-import time
-import math
-import random
-import numpy as np
-import paddle
-import paddle.fluid as fluid
-import six
-import reader
-from net import skip_gram_word2vec, skip_gram_word2vec_shuffle_batch
-
-logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s')
-logger = logging.getLogger("fluid")
-logger.setLevel(logging.INFO)
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="PaddlePaddle Word2vec example")
-    parser.add_argument(
-        '--train_data_dir',
-        type=str,
-        default='./data/text',
-        help="The path of taining dataset")
-    parser.add_argument(
-        '--base_lr',
-        type=float,
-        default=0.01,
-        help="The number of learing rate (default: 0.01)")
-    parser.add_argument(
-        '--save_step',
-        type=int,
-        default=500000,
-        help="The number of step to save (default: 500000)")
-    parser.add_argument(
-        '--print_batch',
-        type=int,
-        default=100,
-        help="The number of print_batch (default: 10)")
-    parser.add_argument(
-        '--dict_path',
-        type=str,
-        default='./data/1-billion_dict',
-        help="The path of data dict")
-    parser.add_argument(
-        '--batch_size',
-        type=int,
-        default=500,
-        help="The size of mini-batch (default:500)")
-    parser.add_argument(
-        '--num_passes',
-        type=int,
-        default=10,
-        help="The number of passes to train (default: 10)")
-    parser.add_argument(
-        '--model_output_dir',
-        type=str,
-        default='models',
-        help='The path for model to store (default: models)')
-    parser.add_argument('--nce_num', type=int, default=5, help='nce_num')
-    parser.add_argument(
-        '--embedding_size',
-        type=int,
-        default=64,
-        help='sparse feature hashing space for index processing')
-    parser.add_argument(
-        '--is_sparse',
-        action='store_true',
-        required=False,
-        default=False,
-        help='embedding and nce will use sparse or not, (default: False)')
-    parser.add_argument(
-        '--with_speed',
-        action='store_true',
-        required=False,
-        default=False,
-        help='print speed or not , (default: False)')
-    parser.add_argument(
-        '--role', type=str, default='pserver', help='trainer or pserver')
-    parser.add_argument(
-        '--endpoints',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The pserver endpoints, like: 127.0.0.1:6000, 127.0.0.1:6001')
-    parser.add_argument(
-        '--current_endpoint',
-        type=str,
-        default='127.0.0.1:6000',
-        help='The current_endpoint')
-    parser.add_argument(
-        '--trainer_id',
-        type=int,
-        default=0,
-        help='trainer id ,only trainer_id=0 save model')
-    parser.add_argument(
-        '--trainers',
-        type=int,
-        default=1,
-        help='The num of trianers, (default: 1)')
-    parser.add_argument(
-        '--with_shuffle_batch',
-        action='store_true',
-        required=False,
-        default=False,
-        help='negative samples come from shuffle_batch op or not , (default: False)') 
-    return parser.parse_args()
-
-
-def convert_python_to_tensor(weight, batch_size, sample_reader):
-    def __reader__():
-        cs = np.array(weight).cumsum()
-        result = [[], []]
-        for sample in sample_reader():
-            for i, fea in enumerate(sample):
-                result[i].append(fea)
-            if len(result[0]) == batch_size:
-                tensor_result = []
-                for tensor in result:
-                    t = fluid.Tensor()
-                    dat = np.array(tensor, dtype='int64')
-                    if len(dat.shape) > 2:
-                        dat = dat.reshape((dat.shape[0], dat.shape[2]))
-                    elif len(dat.shape) == 1:
-                        dat = dat.reshape((-1, 1))
-                    t.set(dat, fluid.CPUPlace())
-                    tensor_result.append(t)
-                tt = fluid.Tensor()
-                neg_array = cs.searchsorted(np.random.sample(args.nce_num))
-                neg_array = np.tile(neg_array, batch_size)
-                tt.set(
-                    neg_array.reshape((batch_size, args.nce_num)),
-                    fluid.CPUPlace())
-                tensor_result.append(tt)
-                yield tensor_result
-                result = [[], []]
-
-    return __reader__
-
-
-def train_loop(args, train_program, data_loader, loss, trainer_id):
-
-    place = fluid.CPUPlace()
-    exe = fluid.Executor(place)
-    exe.run(fluid.default_startup_program())
-
-    print("CPU_NUM:" + str(os.getenv("CPU_NUM")))
-
-    train_exe = exe
-
-    for pass_id in range(args.num_passes):
-        data_loader.start()
-        time.sleep(10)
-        epoch_start = time.time()
-        batch_id = 0
-        start = time.time()
-        try:
-            while True:
-
-                loss_val = train_exe.run(fetch_list=[loss.name])
-                loss_val = np.mean(loss_val)
-
-                if batch_id % args.print_batch == 0:
-                    logger.info(
-                        "TRAIN --> pass: {} batch: {} loss: {} reader queue:{}".
-                        format(pass_id, batch_id,
-                               loss_val.mean(), data_loader.queue.size()))
-                if args.with_speed:
-                    if batch_id % 500 == 0 and batch_id != 0:
-                        elapsed = (time.time() - start)
-                        start = time.time()
-                        samples = 1001 * args.batch_size * int(
-                            os.getenv("CPU_NUM"))
-                        logger.info("Time used: {}, Samples/Sec: {}".format(
-                            elapsed, samples / elapsed))
-
-                if batch_id % args.save_step == 0 and batch_id != 0:
-                    model_dir = args.model_output_dir + '/pass-' + str(
-                        pass_id) + ('/batch-' + str(batch_id))
-                    if trainer_id == 0:
-                        fluid.save(fluid.default_main_program(), model_path=model_dir)
-                        print("model saved in %s" % model_dir)
-                batch_id += 1
-
-        except fluid.core.EOFException:
-            data_loader.reset()
-            epoch_end = time.time()
-            logger.info("Epoch: {0}, Train total expend: {1} ".format(
-                pass_id, epoch_end - epoch_start))
-            model_dir = args.model_output_dir + '/pass-' + str(pass_id)
-            if trainer_id == 0:
-                fluid.save(fluid.default_main_program(), model_path=model_dir)
-                print("model saved in %s" % model_dir)
-
-
-def GetFileList(data_path):
-    return os.listdir(data_path)
-
-
-def train(args):
-
-    if not os.path.isdir(args.model_output_dir) and args.trainer_id == 0:
-        os.mkdir(args.model_output_dir)
-
-    filelist = GetFileList(args.train_data_dir)
-    word2vec_reader = reader.Word2VecReader(args.dict_path, args.train_data_dir,
-                                            filelist, 0, 1)
-
-    logger.info("dict_size: {}".format(word2vec_reader.dict_size))
-   
-    if args.with_shuffle_batch:
-        loss, data_loader = skip_gram_word2vec_shuffle_batch(
-            word2vec_reader.dict_size,
-            args.embedding_size,
-            is_sparse=args.is_sparse,
-            neg_num=args.nce_num)
-        data_loader.set_sample_generator(word2vec_reader.train(), batch_size=args.batch_size, drop_last=True)
-    else:
-        np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
-        id_frequencys_pow = np_power / np_power.sum()
-
-        loss, data_loader = skip_gram_word2vec(
-            word2vec_reader.dict_size,
-            args.embedding_size,
-            is_sparse=args.is_sparse,
-            neg_num=args.nce_num)
-
-        data_loader.set_batch_generator(
-            convert_python_to_tensor(id_frequencys_pow, args.batch_size, word2vec_reader.train())) 
-
-    optimizer = fluid.optimizer.SGD(
-        learning_rate=fluid.layers.exponential_decay(
-            learning_rate=args.base_lr,
-            decay_steps=100000,
-            decay_rate=0.999,
-            staircase=True))
-
-    optimizer.minimize(loss)
-
-    logger.info("run dist training")
-
-    t = fluid.DistributeTranspiler()
-    t.transpile(
-        args.trainer_id, pservers=args.endpoints, trainers=args.trainers)
-    if args.role == "pserver":
-        print("run psever")
-        pserver_prog = t.get_pserver_program(args.current_endpoint)
-        pserver_startup = t.get_startup_program(args.current_endpoint,
-                                                pserver_prog)
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(pserver_startup)
-        exe.run(pserver_prog)
-    elif args.role == "trainer":
-        print("run trainer")
-        train_loop(args,
-                   t.get_trainer_program(), data_loader, loss,
-                   args.trainer_id)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    train(args)
diff --git a/PaddleRec/word2vec/cluster_train.sh b/PaddleRec/word2vec/cluster_train.sh
deleted file mode 100644
index 756196fd41eeb52d5f43553664c824748ac83e4e..0000000000000000000000000000000000000000
--- a/PaddleRec/word2vec/cluster_train.sh
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/bin/bash
-
-#export GLOG_v=30
-#export GLOG_logtostderr=1
-
-# start pserver0
-export CPU_NUM=5 
-export FLAGS_rpc_deadline=3000000
-python cluster_train.py \
-    --train_data_dir data/convert_text8 \
-    --dict_path data/test_build_dict \
-    --batch_size 100 \
-    --model_output_dir dis_model \
-    --base_lr 1.0 \
-    --print_batch 1 \
-    --is_sparse \
-    --with_speed \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6000 \
-    --trainers 2 \
-    > pserver0.log 2>&1 &
-
-python cluster_train.py \
-    --train_data_dir data/convert_text8 \
-    --dict_path data/test_build_dict \
-    --batch_size 100 \
-    --model_output_dir dis_model \
-    --base_lr 1.0 \
-    --print_batch 1 \
-    --is_sparse \
-    --with_speed \
-    --role pserver \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --current_endpoint 127.0.0.1:6001 \
-    --trainers 2 \
-    > pserver1.log 2>&1 &
-
-# start trainer0
-python cluster_train.py \
-    --train_data_dir data/convert_text8 \
-    --dict_path data/test_build_dict \
-    --batch_size 100 \
-    --model_output_dir dis_model \
-    --base_lr 1.0 \
-    --print_batch 1000 \
-    --is_sparse \
-    --with_speed \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 0 \
-    > trainer0.log 2>&1 &
-# start trainer1
-python cluster_train.py \
-    --train_data_dir data/convert_text8 \
-    --dict_path data/test_build_dict \
-    --batch_size 100 \
-    --model_output_dir dis_model \
-    --base_lr 1.0 \
-    --print_batch 1000 \
-    --is_sparse \
-    --with_speed \
-    --role trainer \
-    --endpoints 127.0.0.1:6000,127.0.0.1:6001 \
-    --trainers 2 \
-    --trainer_id 1 \
-    > trainer1.log 2>&1 &
diff --git a/PaddleRec/word2vec/net.py b/PaddleRec/word2vec/net.py
index 3e2ab33b92dc2503e69bf21efecf7b1714a099a0..c71e1bb54b957a71adb8f4427c762ae43ce76325 100644
--- a/PaddleRec/word2vec/net.py
+++ b/PaddleRec/word2vec/net.py
@@ -20,7 +20,10 @@ import numpy as np
 import paddle.fluid as fluid
 
 
-def skip_gram_word2vec_shuffle_batch(dict_size, embedding_size, is_sparse=False, neg_num=5):
+def skip_gram_word2vec_shuffle_batch(dict_size,
+                                     embedding_size,
+                                     is_sparse=False,
+                                     neg_num=5):
 
     words = []
     input_word = fluid.data(name="input_word", shape=[None, 1], dtype='int64')
@@ -61,7 +64,8 @@ def skip_gram_word2vec_shuffle_batch(dict_size, embedding_size, is_sparse=False,
     # add shuffle_batch after embedding. 
     neg_emb_w_list = []
     for i in range(neg_num):
-        neg_emb_w_list.append(fluid.contrib.layers.shuffle_batch(true_emb_w))  # shuffle true_word
+        neg_emb_w_list.append(
+            fluid.contrib.layers.shuffle_batch(true_emb_w))  # shuffle true_word
     neg_emb_w = fluid.layers.concat(neg_emb_w_list, axis=0)
 
     neg_emb_w_re = fluid.layers.reshape(
@@ -69,7 +73,8 @@ def skip_gram_word2vec_shuffle_batch(dict_size, embedding_size, is_sparse=False,
 
     neg_emb_b_list = []
     for i in range(neg_num):
-        neg_emb_b_list.append(fluid.contrib.layers.shuffle_batch(true_emb_b))  # shuffle true_word
+        neg_emb_b_list.append(
+            fluid.contrib.layers.shuffle_batch(true_emb_b))  # shuffle true_word
     neg_emb_b = fluid.layers.concat(neg_emb_b_list, axis=0)
     neg_emb_b_vec = fluid.layers.reshape(neg_emb_b, shape=[-1, neg_num])
 
@@ -81,15 +86,20 @@ def skip_gram_word2vec_shuffle_batch(dict_size, embedding_size, is_sparse=False,
         true_emb_b)
     input_emb_re = fluid.layers.reshape(
         input_emb, shape=[-1, 1, embedding_size])
-    neg_matmul = fluid.layers.matmul(input_emb_re, neg_emb_w_re, transpose_y=True)
+    neg_matmul = fluid.layers.matmul(
+        input_emb_re, neg_emb_w_re, transpose_y=True)
     neg_matmul_re = fluid.layers.reshape(neg_matmul, shape=[-1, neg_num])
     neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
     #nce loss
 
-    label_ones = fluid.layers.fill_constant_batch_size_like(
-        true_logits, shape=[-1, 1], value=1.0, dtype='float32')
-    label_zeros = fluid.layers.fill_constant_batch_size_like(
-        true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
+    label_ones = fluid.layers.fill_constant(
+        shape=[fluid.layers.shape(true_logits)[0], 1],
+        value=1.0,
+        dtype='float32')
+    label_zeros = fluid.layers.fill_constant(
+        shape=[fluid.layers.shape(true_logits)[0], neg_num],
+        value=0.0,
+        dtype='float32')
 
     true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
                                                                label_ones)
@@ -103,6 +113,7 @@ def skip_gram_word2vec_shuffle_batch(dict_size, embedding_size, is_sparse=False,
     avg_cost = fluid.layers.reduce_mean(cost)
     return avg_cost, data_loader
 
+
 def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
 
     words = []
@@ -171,10 +182,14 @@ def skip_gram_word2vec(dict_size, embedding_size, is_sparse=False, neg_num=5):
     neg_logits = fluid.layers.elementwise_add(neg_matmul_re, neg_emb_b_vec)
     #nce loss
 
-    label_ones = fluid.layers.fill_constant_batch_size_like(
-        true_logits, shape=[-1, 1], value=1.0, dtype='float32')
-    label_zeros = fluid.layers.fill_constant_batch_size_like(
-        true_logits, shape=[-1, neg_num], value=0.0, dtype='float32')
+    label_ones = fluid.layers.fill_constant(
+        shape=[fluid.layers.shape(true_logits)[0], 1],
+        value=1.0,
+        dtype='float32')
+    label_zeros = fluid.layers.fill_constant(
+        shape=[fluid.layers.shape(true_logits)[0], neg_num],
+        value=0.0,
+        dtype='float32')
 
     true_xent = fluid.layers.sigmoid_cross_entropy_with_logits(true_logits,
                                                                label_ones)
diff --git a/PaddleRec/word2vec/train.py b/PaddleRec/word2vec/train.py
index df77c4bd72a5471025b62c5a84fe564bbbb987bd..239d6caa9ec096a576965b6bc0895f599f351381 100644
--- a/PaddleRec/word2vec/train.py
+++ b/PaddleRec/word2vec/train.py
@@ -221,7 +221,7 @@ def train(args):
             args.embedding_size,
             is_sparse=args.is_sparse,
             neg_num=args.nce_num)
-	data_loader.set_sample_generator(word2vec_reader.train(), batch_size=args.batch_size, drop_last=True)
+        data_loader.set_sample_generator(word2vec_reader.train(), batch_size=args.batch_size, drop_last=True)
     else:
         np_power = np.power(np.array(word2vec_reader.id_frequencys), 0.75)
         id_frequencys_pow = np_power / np_power.sum()
diff --git a/dygraph/bmn/BMN.png b/dygraph/bmn/BMN.png
new file mode 100644
index 0000000000000000000000000000000000000000..46b9743cf5b5e79ada93ebb6b61da96967f350a4
Binary files /dev/null and b/dygraph/bmn/BMN.png differ
diff --git a/dygraph/bmn/README.md b/dygraph/bmn/README.md
index 715c3cb7d77d014f10e68f3d32aab512aef21156..864b5cf1df667cf04dec57fd25a73df380eb38ff 100644
--- a/dygraph/bmn/README.md
+++ b/dygraph/bmn/README.md
@@ -17,7 +17,7 @@
 BMN模型是百度自研，2019年ActivityNet夺冠方案，为视频动作定位问题中proposal的生成提供高效的解决方案，在PaddlePaddle上首次开源。此模型引入边界匹配(Boundary-Matching, BM)机制来评估proposal的置信度，按照proposal开始边界的位置及其长度将所有可能存在的proposal组合成一个二维的BM置信度图，图中每个点的数值代表其所对应的proposal的置信度分数。网络由三个模块组成，基础模块作为主干网络处理输入的特征序列，TEM模块预测每一个时序位置属于动作开始、动作结束的概率，PEM模块生成BM置信度图。
 
 <p align="center">
-<img src="../../PaddleCV/PaddleVideo/images/BMN.png" height=300 width=500 hspace='10'/> <br />
+<img src="./BMN.png" height=300 width=500 hspace='10'/> <br />
 BMN Overview
 </p>
 
@@ -44,7 +44,7 @@ BMN模型的静态图实现请参考[PaddleVideo](../../PaddleCV/PaddleVideo)
 
 ## 数据准备
 
-BMN的训练数据采用ActivityNet1.3提供的数据集，我们提供了处理好的视频特征，请下载[bmn\_feat](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz)数据后解压，同时相应的修改bmn.yaml中的特征路径feat\_path。
+BMN的训练数据采用ActivityNet1.3提供的数据集，我们提供了处理好的视频特征和对应的标签文件，请下载特征数据[bmn\_feat](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz)和标签数据[label](https://paddlemodels.bj.bcebos.com/video_detection/activitynet_1.3_annotations.json)，并相应地修改配置文件bmn.yaml中的特征文件路径feat\_path和标签文件路径anno\_file。
 
 
 ## 模型训练
@@ -55,7 +55,7 @@ BMN的训练数据采用ActivityNet1.3提供的数据集，我们提供了处理
 
     bash run.sh
 
-若使用单卡训练，启动方式如下:
+若使用单卡训练，请将配置文件bmn.yaml中`TRAIN`和`VALID`对应的batch\_size调整为16，启动方式如下:
 
     export CUDA_VISIBLE_DEVICES=0
     python train.py
diff --git a/dygraph/bmn/bmn.yaml b/dygraph/bmn/bmn.yaml
index dbcb2f649ffdb2e612b76abcb415dcc27e10bd61..315b470d90694a1b16ac78f2b832bd6ea42cb6e9 100644
--- a/dygraph/bmn/bmn.yaml
+++ b/dygraph/bmn/bmn.yaml
@@ -6,13 +6,13 @@ MODEL:
   prop_boundary_ratio: 0.5
   num_sample: 32
   num_sample_perbin: 3
-  anno_file: "../../PaddleCV/video/data/dataset/bmn/activitynet_1.3_annotations.json"
-  feat_path: './fix_feat_100'
+  anno_file: "./activitynet_1.3_annotations.json"
+  feat_path: "./fix_feat_100"
 
 TRAIN:
   subset: "train"
   epoch: 9
-  batch_size: 16
+  batch_size: 4
   num_threads: 8
   use_gpu: True
   num_gpus: 4
@@ -23,7 +23,7 @@ TRAIN:
 
 VALID:
   subset: "validation"
-  batch_size: 16
+  batch_size: 4
   num_threads: 8
   use_gpu: True
   num_gpus: 4
diff --git a/dygraph/ptb_lm/args.py b/dygraph/ptb_lm/args.py
index 6449b274542185dbb070fdfce0d14bd8138eeea9..4bd05089100892044a7660913bca6faa01bfd5a4 100644
--- a/dygraph/ptb_lm/args.py
+++ b/dygraph/ptb_lm/args.py
@@ -57,6 +57,12 @@ def parse_args():
         type=str,
         default=None,
         help='dir to init model.')
+    # NOTE: used for benchmark
+    parser.add_argument(
+        '--max_iter',
+        type=int,
+        default=0,
+        help='the max iters for train, used for benchmark.')
     parser.add_argument('--ce', action='store_true', help="run ce")
     args = parser.parse_args()
     return args
diff --git a/dygraph/ptb_lm/ptb_dy.py b/dygraph/ptb_lm/ptb_dy.py
index 973d4fde52931db404214c15d56a0032ea29e4de..d33e64194c33c5a4c7ddedbda405daa58fe330ae 100644
--- a/dygraph/ptb_lm/ptb_dy.py
+++ b/dygraph/ptb_lm/ptb_dy.py
@@ -374,6 +374,8 @@ def train_ptb_lm():
 
         ce_time = []
         ce_ppl = []
+        
+        total_batch_num = 0  #this is for benchmark
         for epoch_id in range(max_epoch):
             ptb_model.train()
             total_loss = 0.0
@@ -389,6 +391,9 @@ def train_ptb_lm():
             init_cell = to_variable(init_cell_data)
             start_time = time.time()
             for batch_id, batch in enumerate(train_data_iter):
+                if args.max_iter and total_batch_num == args.max_iter:
+                    return
+                batch_start = time.time()
                 x_data, y_data = batch
 
                 x_data = x_data.reshape((-1, num_steps, 1))
@@ -408,13 +413,16 @@ def train_ptb_lm():
 
                 ptb_model.clear_gradients()
                 total_loss += out_loss
+                batch_end = time.time()
+                train_batch_cost = batch_end - batch_start
                 iters += num_steps
+                total_batch_num = total_batch_num + 1 #this is for benchmark
 
                 if batch_id > 0 and batch_id % log_interval == 0:
                     ppl = np.exp(total_loss / iters)
-                    print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f" %
+                    print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f, batch cost: %.5f" %
                           (epoch_id, batch_id, ppl[0],
-                           sgd._global_learning_rate().numpy(), out_loss))
+                           sgd._global_learning_rate().numpy(), out_loss, train_batch_cost))
 
             print("one epoch finished", epoch_id)
             print("time cost ", time.time() - start_time)
diff --git a/dygraph/resnet/train.py b/dygraph/resnet/train.py
index d21f650b710c2cbe31415de7b434ccce80f9baf4..e92a39bde5bce633dda9452d5c0dad3399092248 100644
--- a/dygraph/resnet/train.py
+++ b/dygraph/resnet/train.py
@@ -25,6 +25,7 @@ from paddle.fluid import framework
 
 import math
 import sys
+import time
 
 IMAGENET1000 = 1281167
 base_lr = 0.1
@@ -45,6 +46,9 @@ def parse_args():
     parser.add_argument(
         "-b", "--batch_size", default=32, type=int, help="set epoch")
     parser.add_argument("--ce", action="store_true", help="run ce")
+   
+    # NOTE:used in benchmark
+    parser.add_argument("--max_iter", default=0, type=int, help="the max iters to train, used in benchmark")
     args = parser.parse_args()
     return args
 
@@ -310,6 +314,9 @@ def train_resnet():
         #file_name = './model/epoch_0.npz'
         #model_data = np.load( file_name )
 
+        #NOTE: used in benchmark 
+        total_batch_num = 0
+
         for eop in range(epoch):
 
             resnet.train()
@@ -325,6 +332,12 @@ def train_resnet():
             print("load finished")
 
             for batch_id, data in enumerate(train_reader()):
+
+                #NOTE: used in benchmark
+                if args.max_iter and total_batch_num == args.max_iter:
+                    return
+                batch_start = time.time()
+
                 dy_x_data = np.array(
                     [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
                 if len(np.array([x[1]
@@ -356,15 +369,18 @@ def train_resnet():
                 optimizer.minimize(avg_loss)
                 resnet.clear_gradients()
 
+                batch_end = time.time()
+                train_batch_cost = batch_end - batch_start
                 total_loss += dy_out
                 total_acc1 += acc_top1.numpy()
                 total_acc5 += acc_top5.numpy()
                 total_sample += 1
+                total_batch_num = total_batch_num + 1 #this is for benchmark
                 #print("epoch id: %d, batch step: %d, loss: %f" % (eop, batch_id, dy_out))
                 if batch_id % 10 == 0:
-                    print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f" % \
+                    print( "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f, batch cost: %.5f" % \
                            ( eop, batch_id, total_loss / total_sample, \
-                             total_acc1 / total_sample, total_acc5 / total_sample))
+                             total_acc1 / total_sample, total_acc5 / total_sample, train_batch_cost))
 
             if args.ce:
                 print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
diff --git a/dygraph/similarity_net/README.md b/dygraph/similarity_net/README.md
index 4f7270b0534f99be21b3ae786db374315e10006d..dfb6ecbb0e0b37b6fcca51e90372db980eb3b718 100644
--- a/dygraph/similarity_net/README.md
+++ b/dygraph/similarity_net/README.md
@@ -23,7 +23,7 @@
 ## 快速开始
 #### 版本依赖
 
-本项目依赖于 Paddlepaddle Fluid 1.7，请参考[安装指南](http://www.paddlepaddle.org/#quick-start)进行安装。
+本项目依赖于 PaddlePaddle Fluid 1.8，请参考[安装指南](https://www.paddlepaddle.org.cn/install/quick)进行安装。
 
 
 #### 安装代码
diff --git a/dygraph/similarity_net/model_check.py b/dygraph/similarity_net/model_check.py
index 51713452a7f0b1019c7b8b7d37d24e0c5f15c77c..853a4a92d71f708e6996134e9c73e3c92b8e4f5f 100644
--- a/dygraph/similarity_net/model_check.py
+++ b/dygraph/similarity_net/model_check.py
@@ -33,20 +33,21 @@ def check_cuda(use_cuda, err = \
     except Exception as e:
         pass
 
+
 def check_version():
-        """
+    """
         Log error and exit when the installed version of paddlepaddle is
         not satisfied.
         """
-        err = "PaddlePaddle version 1.6 or higher is required, " \
-            "or a suitable develop version is satisfied as well. \n" \
-            "Please make sure the version is good with your code." \
+    err = "PaddlePaddle version 1.8 or higher is required, " \
+        "or a suitable develop version is satisfied as well. \n" \
+        "Please make sure the version is good with your code." \
 
-        try:
-            fluid.require_version('1.6.0')
-        except Exception as e:
-            print(err)
-            sys.exit(1)
+    try:
+        fluid.require_version('1.8.0')
+    except Exception as e:
+        print(err)
+        sys.exit(1)
 
 
 def check_version():
@@ -59,7 +60,7 @@ def check_version():
         "Please make sure the version is good with your code." \
 
     try:
-        fluid.require_version('1.6.0')
+        fluid.require_version('1.8.0')
     except Exception as e:
         print(err)
         sys.exit(1)
diff --git a/dygraph/similarity_net/nets/paddle_layers.py b/dygraph/similarity_net/nets/paddle_layers.py
index 6a797c22f7bd1ae624af3de1474eaedca59fb257..d0f5c0bff35b7ef9f81c096c7ced30230b1d1460 100644
--- a/dygraph/similarity_net/nets/paddle_layers.py
+++ b/dygraph/similarity_net/nets/paddle_layers.py
@@ -30,6 +30,7 @@ import paddle.fluid.layers.utils as utils
 from paddle.fluid.dygraph import Embedding, Conv2D, GRUUnit, Layer, to_variable
 from paddle.fluid.layers.utils import map_structure, flatten, pack_sequence_as
 
+
 class EmbeddingLayer(object):
     """
     Embedding Layer class
@@ -52,12 +53,12 @@ class EmbeddingLayer(object):
             size=[self.dict_size, self.emb_dim],
             is_sparse=True,
             padding_idx=self.padding_idx,
-            param_attr=attr.ParamAttr(name=self.name, initializer=fluid.initializer.Xavier()))
+            param_attr=attr.ParamAttr(
+                name=self.name, initializer=fluid.initializer.Xavier()))
 
         return emb
 
 
-
 class FCLayer(object):
     """
     Fully Connect Layer class
@@ -76,9 +77,9 @@ class FCLayer(object):
         operation
         """
         fc = FC(size=self.fc_dim,
-                    param_attr=attr.ParamAttr(name="%s.w" % self.name),
-                    bias_attr=attr.ParamAttr(name="%s.b" % self.name),
-                    act=self.act)
+                param_attr=attr.ParamAttr(name="%s.w" % self.name),
+                bias_attr=attr.ParamAttr(name="%s.b" % self.name),
+                act=self.act)
         return fc
 
 
@@ -93,7 +94,7 @@ class DynamicGRULayer(object):
         """
         self.gru_dim = gru_dim
         self.name = name
-  
+
     def ops(self):
         """
         operation
@@ -117,11 +118,13 @@ class DynamicLSTMLayer(object):
         self.lstm_dim = lstm_dim
         self.name = name
         self.is_reverse = is_reverse
+
     def ops(self):
         """
         operation
         """
-        lstm_cell = BasicLSTMUnit(hidden_size=self.lstm_dim, input_size=self.lstm_dim*4)
+        lstm_cell = BasicLSTMUnit(
+            hidden_size=self.lstm_dim, input_size=self.lstm_dim * 4)
         lstm = RNN(cell=lstm_cell, time_major=True, is_reverse=self.is_reverse)
         return lstm
 
@@ -141,7 +144,7 @@ class DataLayer(object):
         """
         operation
         """
-        data = fluid.layers.data( 
+        data = fluid.data(
             name=name, shape=shape, dtype=dtype, lod_level=lod_level)
         return data
 
@@ -314,8 +317,10 @@ class ConstantLayer(object):
         """
         operation
         """
-        constant = fluid.layers.fill_constant_batch_size_like(input, shape,
-                                                              dtype, value)
+        shape = list(shape)
+        input_shape = fluid.layers.shape(input)
+        shape[0] = input_shape[0]
+        constant = fluid.layers.fill_constant(shape, dtype, value)
         return constant
 
 
@@ -358,26 +363,23 @@ class SoftsignLayer(object):
 
 
 class SimpleConvPool(Layer):
-    def __init__(self,
-                 num_channels,
-                 num_filters,
-                 filter_size,
-                 use_cudnn=False
-                 ):
+    def __init__(self, num_channels, num_filters, filter_size, use_cudnn=False):
         super(SimpleConvPool, self).__init__()
-        self._conv2d = Conv2D(num_channels = num_channels,
+        self._conv2d = Conv2D(
+            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
-            padding=[1, 1],                 
+            padding=[1, 1],
             use_cudnn=use_cudnn,
             act='relu')
 
     def forward(self, inputs):
         x = self._conv2d(inputs)
         x = fluid.layers.reduce_max(x, dim=-1)
-        x = fluid.layers.reshape(x, shape=[x.shape[0],  -1])
+        x = fluid.layers.reshape(x, shape=[x.shape[0], -1])
         return x
 
+
 class FC(Layer):
     """
     This interface is used to construct a callable object of the ``FC`` class.
@@ -580,7 +582,7 @@ class DynamicGRU(Layer):
                  gate_activation='sigmoid',
                  candidate_activation='tanh',
                  origin_mode=False,
-                 init_size = None):
+                 init_size=None):
         super(DynamicGRU, self).__init__()
         self.gru_unit = GRUUnit(
             size * 3,
@@ -591,16 +593,19 @@ class DynamicGRU(Layer):
             origin_mode=origin_mode)
         self.size = size
         self.is_reverse = is_reverse
+
     def forward(self, inputs, h_0):
         hidden = h_0
         res = []
         for i in range(inputs.shape[1]):
             if self.is_reverse:
                 i = inputs.shape[1] - 1 - i
-            input_ = inputs[ :, i:i+1, :]
-            input_ = fluid.layers.reshape(input_, [-1, input_.shape[2]], inplace=False)
+            input_ = inputs[:, i:i + 1, :]
+            input_ = fluid.layers.reshape(
+                input_, [-1, input_.shape[2]], inplace=False)
             hidden, reset, gate = self.gru_unit(input_, hidden)
-            hidden_ = fluid.layers.reshape(hidden, [-1, 1, hidden.shape[1]], inplace=False)
+            hidden_ = fluid.layers.reshape(
+                hidden, [-1, 1, hidden.shape[1]], inplace=False)
             res.append(hidden_)
         if self.is_reverse:
             res = res[::-1]
@@ -786,18 +791,21 @@ class BasicLSTMUnit(RNNUnit):
 
         self._weight = self.create_parameter(
             attr=self._param_attr,
-            shape=[self._input_size + self._hidden_size, 4 * self._hidden_size],
+            shape=[
+                self._input_size + self._hidden_size, 4 * self._hidden_size
+            ],
             dtype=self._dtype)
-        
-        self._bias = self.create_parameter(attr=self._bias_attr,
-                                           shape=[4 * self._hidden_size],
-                                           dtype=self._dtype,
-                                           is_bias=True)
+
+        self._bias = self.create_parameter(
+            attr=self._bias_attr,
+            shape=[4 * self._hidden_size],
+            dtype=self._dtype,
+            is_bias=True)
 
     def forward(self, input, state):
         pre_hidden, pre_cell = state
         concat_input_hidden = layers.concat([input, pre_hidden], axis=1)
-    
+
         gate_input = layers.matmul(x=concat_input_hidden, y=self._weight)
 
         gate_input = layers.elementwise_add(gate_input, self._bias)
@@ -817,11 +825,7 @@ class BasicLSTMUnit(RNNUnit):
 
 
 class RNN(Layer):
-    def __init__(self,
-                 cell,
-                 is_reverse=False,
-                 time_major=False,
-                 **kwargs):
+    def __init__(self, cell, is_reverse=False, time_major=False, **kwargs):
         super(RNN, self).__init__()
         self.cell = cell
         if not hasattr(self.cell, "call"):
@@ -831,12 +835,17 @@ class RNN(Layer):
         self.batch_index, self.time_step_index = (1, 0) if time_major else (0,
                                                                             1)
 
-    def forward(self, inputs, initial_states=None, sequence_length=None, **kwargs):
+    def forward(self,
+                inputs,
+                initial_states=None,
+                sequence_length=None,
+                **kwargs):
         if fluid.in_dygraph_mode():
 
             class OutputArray(object):
                 def __init__(self, x):
                     self.array = [x]
+
                 def append(self, x):
                     self.array.append(x)
 
@@ -844,9 +853,8 @@ class RNN(Layer):
                 # TODO: use where_op
                 new_state = fluid.layers.elementwise_mul(
                     new_state, step_mask,
-                    axis=0) - fluid.layers.elementwise_mul(state,
-                                                           (step_mask - 1),
-                                                           axis=0)
+                    axis=0) - fluid.layers.elementwise_mul(
+                        state, (step_mask - 1), axis=0)
                 return new_state
 
             flat_inputs = flatten(inputs)
@@ -872,16 +880,20 @@ class RNN(Layer):
 
             if self.is_reverse:
                 inputs = map_structure(lambda x: fluid.layers.reverse(x, axis=[0]), inputs)
-                mask = fluid.layers.reverse(mask, axis=[0]) if sequence_length is not None else None
+                mask = fluid.layers.reverse(
+                    mask, axis=[0]) if sequence_length is not None else None
 
             states = initial_states
             outputs = []
             for i in range(time_steps):
-                step_inputs = map_structure(lambda x:x[i], inputs)
-                step_outputs, new_states = self.cell(step_inputs, states, **kwargs)
+                step_inputs = map_structure(lambda x: x[i], inputs)
+                step_outputs, new_states = self.cell(step_inputs, states,
+                                                     **kwargs)
                 if sequence_length is not None:
                     new_states = map_structure(
-                        partial(_maybe_copy, step_mask=mask[i]), states,
+                        partial(
+                            _maybe_copy, step_mask=mask[i]),
+                        states,
                         new_states)
                 states = new_states
                 if i == 0:
@@ -922,10 +934,9 @@ class EncoderCell(RNNUnit):
         self.lstm_cells = list()
         for i in range(self.num_layers):
             self.lstm_cells.append(
-                self.add_sublayer(
-                    "layer_%d" % i,
-                    BasicLSTMUnit(input_size if i == 0 else hidden_size,
-                                  hidden_size)))
+                self.add_sublayer("layer_%d" % i,
+                                  BasicLSTMUnit(input_size if i == 0 else
+                                                hidden_size, hidden_size)))
 
     def forward(self, step_input, states):
         new_states = []
@@ -1040,4 +1051,3 @@ class BasicGRUUnit(Layer):
         new_hidden = u * pre_hidden + (1 - u) * c
 
         return new_hidden
-
diff --git a/dygraph/similarity_net/run_classifier.py b/dygraph/similarity_net/run_classifier.py
index ff82fdfbaefd67e397af2d31a4159988ffaeabf6..ff464e43d922739e113385f3668f4852764b913a 100644
--- a/dygraph/similarity_net/run_classifier.py
+++ b/dygraph/similarity_net/run_classifier.py
@@ -47,18 +47,18 @@ from utils import load_dygraph
 from model_check import check_version
 from model_check import check_cuda
 
-            
+
 def train(conf_dict, args):
     """
     train process
     """
-    
+
     # Get device
     if args.use_cuda:
         place = fluid.CUDAPlace(0)
     else:
         place = fluid.CPUPlace()
-    
+
     # run train
     logging.info("start train process ...")
 
@@ -84,7 +84,6 @@ def train(conf_dict, args):
             return auc, acc
         else:
             return auc
-    
 
     with fluid.dygraph.guard(place):
         # used for continuous evaluation 
@@ -100,35 +99,35 @@ def train(conf_dict, args):
         conf_dict['seq_len'] = args.seq_len
 
         # Load network structure dynamically
-        net = utils.import_class("./nets",
-                                conf_dict["net"]["module_name"],
-                                conf_dict["net"]["class_name"])(conf_dict)
+        net = utils.import_class("./nets", conf_dict["net"]["module_name"],
+                                 conf_dict["net"]["class_name"])(conf_dict)
         if args.init_checkpoint is not "":
             model, _ = load_dygraph(args.init_checkpoint)
             net.set_dict(model)
         # Load loss function dynamically
         loss = utils.import_class("./nets/losses",
-                                conf_dict["loss"]["module_name"],
-                                conf_dict["loss"]["class_name"])(conf_dict)
+                                  conf_dict["loss"]["module_name"],
+                                  conf_dict["loss"]["class_name"])(conf_dict)
         # Load Optimization method
         learning_rate = conf_dict["optimizer"]["learning_rate"]
         optimizer_name = conf_dict["optimizer"]["class_name"]
-        if optimizer_name=='SGDOptimizer':
-            optimizer = fluid.optimizer.SGDOptimizer(learning_rate,parameter_list=net.parameters())
-        elif optimizer_name=='AdamOptimizer':
+        if optimizer_name == 'SGDOptimizer':
+            optimizer = fluid.optimizer.SGDOptimizer(
+                learning_rate, parameter_list=net.parameters())
+        elif optimizer_name == 'AdamOptimizer':
             beta1 = conf_dict["optimizer"]["beta1"]
             beta2 = conf_dict["optimizer"]["beta2"]
             epsilon = conf_dict["optimizer"]["epsilon"]
             optimizer = fluid.optimizer.AdamOptimizer(
-            learning_rate,
-            beta1=beta1,
-            beta2=beta2,
-            epsilon=epsilon,
-            parameter_list=net.parameters())
+                learning_rate,
+                beta1=beta1,
+                beta2=beta2,
+                epsilon=epsilon,
+                parameter_list=net.parameters())
 
         # load auc method
         metric = fluid.metrics.Auc(name="auc")
-        simnet_process = reader.SimNetProcessor(args, vocab) 
+        simnet_process = reader.SimNetProcessor(args, vocab)
 
         # set global step
         global_step = 0
@@ -136,23 +135,33 @@ def train(conf_dict, args):
         losses = []
         start_time = time.time()
 
-        train_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
-        get_train_examples = simnet_process.get_reader("train",epoch=args.epoch)
-        train_pyreader.decorate_sample_list_generator(
-                paddle.batch(get_train_examples, batch_size=args.batch_size),
-                place)
+        train_loader = fluid.io.DataLoader.from_generator(
+            capacity=16,
+            return_list=True,
+            iterable=True,
+            use_double_buffer=True)
+        get_train_examples = simnet_process.get_reader(
+            "train", epoch=args.epoch)
+        train_loader.set_sample_list_generator(
+            paddle.batch(
+                get_train_examples, batch_size=args.batch_size), place)
         if args.do_valid:
-            valid_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
-            get_valid_examples = simnet_process.get_reader("valid")  
-            valid_pyreader.decorate_sample_list_generator(
-                paddle.batch(get_valid_examples, batch_size=args.batch_size),
+            valid_loader = fluid.io.DataLoader.from_generator(
+                capacity=16,
+                return_list=True,
+                iterable=True,
+                use_double_buffer=True)
+            get_valid_examples = simnet_process.get_reader("valid")
+            valid_loader.set_sample_list_generator(
+                paddle.batch(
+                    get_valid_examples, batch_size=args.batch_size),
                 place)
             pred_list = []
 
         if args.task_mode == "pairwise":
-         
-            for left, pos_right, neg_right in train_pyreader():
-                
+
+            for left, pos_right, neg_right in train_loader():
+
                 left = fluid.layers.reshape(left, shape=[-1, 1])
                 pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
                 neg_right = fluid.layers.reshape(neg_right, shape=[-1, 1])
@@ -162,92 +171,98 @@ def train(conf_dict, args):
                 pred = pos_score
                 _, neg_score = net(left, neg_right)
                 avg_cost = loss.compute(pos_score, neg_score)
-                losses.append(np.mean(avg_cost.numpy()))  
+                losses.append(np.mean(avg_cost.numpy()))
                 avg_cost.backward()
                 optimizer.minimize(avg_cost)
                 net.clear_gradients()
-                
+
                 if args.do_valid and global_step % args.validation_steps == 0:
-                    for left, pos_right in valid_pyreader():
+                    for left, pos_right in valid_loader():
                         left = fluid.layers.reshape(left, shape=[-1, 1])
-                        pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
+                        pos_right = fluid.layers.reshape(
+                            pos_right, shape=[-1, 1])
                         net.eval()
                         left_feat, pos_score = net(left, pos_right)
                         pred = pos_score
-                       
-                        pred_list += list(pred.numpy())  
-                    valid_result = valid_and_test(pred_list, simnet_process, "valid")     
+
+                        pred_list += list(pred.numpy())
+                    valid_result = valid_and_test(pred_list, simnet_process,
+                                                  "valid")
                     if args.compute_accuracy:
                         valid_auc, valid_acc = valid_result
                         logging.info(
-                            "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" %
-                            (global_step, valid_auc, valid_acc, np.mean(losses)))
+                            "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f"
+                            % (global_step, valid_auc, valid_acc,
+                               np.mean(losses)))
                     else:
                         valid_auc = valid_result
-                        logging.info("global_steps: %d, valid_auc: %f, valid_loss: %f" %
-                                    (global_step, valid_auc, np.mean(losses)))
+                        logging.info(
+                            "global_steps: %d, valid_auc: %f, valid_loss: %f" %
+                            (global_step, valid_auc, np.mean(losses)))
 
                 if global_step % args.save_steps == 0:
                     model_save_dir = os.path.join(args.output_dir,
-                                                conf_dict["model_path"])
+                                                  conf_dict["model_path"])
                     model_path = os.path.join(model_save_dir, str(global_step))
-                        
+
                     if not os.path.exists(model_save_dir):
                         os.makedirs(model_save_dir)
                     fluid.dygraph.save_dygraph(net.state_dict(), model_path)
-                    
+
                     logging.info("saving infer model in %s" % model_path)
         else:
-            for left, right, label in train_pyreader():
+            for left, right, label in train_loader():
                 left = fluid.layers.reshape(left, shape=[-1, 1])
                 right = fluid.layers.reshape(right, shape=[-1, 1])
                 label = fluid.layers.reshape(label, shape=[-1, 1])
                 net.train()
-                global_step += 1 
+                global_step += 1
                 left_feat, pred = net(left, right)
                 avg_cost = loss.compute(pred, label)
-                losses.append(np.mean(avg_cost.numpy())) 
+                losses.append(np.mean(avg_cost.numpy()))
                 avg_cost.backward()
                 optimizer.minimize(avg_cost)
                 net.clear_gradients()
-            
+
                 if args.do_valid and global_step % args.validation_steps == 0:
-                    for left, right in valid_pyreader():
+                    for left, right in valid_loader():
                         left = fluid.layers.reshape(left, shape=[-1, 1])
                         right = fluid.layers.reshape(right, shape=[-1, 1])
                         net.eval()
                         left_feat, pred = net(left, right)
                         pred_list += list(pred.numpy())
-                    valid_result = valid_and_test(pred_list, simnet_process, "valid")     
+                    valid_result = valid_and_test(pred_list, simnet_process,
+                                                  "valid")
                     if args.compute_accuracy:
                         valid_auc, valid_acc = valid_result
                         logging.info(
-                            "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f" %
-                            (global_step, valid_auc, valid_acc, np.mean(losses)))
+                            "global_steps: %d, valid_auc: %f, valid_acc: %f, valid_loss: %f"
+                            % (global_step, valid_auc, valid_acc,
+                               np.mean(losses)))
                     else:
                         valid_auc = valid_result
-                        logging.info("global_steps: %d, valid_auc: %f, valid_loss: %f" %
-                                    (global_step, valid_auc, np.mean(losses)))
+                        logging.info(
+                            "global_steps: %d, valid_auc: %f, valid_loss: %f" %
+                            (global_step, valid_auc, np.mean(losses)))
 
                 if global_step % args.save_steps == 0:
                     model_save_dir = os.path.join(args.output_dir,
-                                                conf_dict["model_path"])
+                                                  conf_dict["model_path"])
                     model_path = os.path.join(model_save_dir, str(global_step))
-                        
+
                     if not os.path.exists(model_save_dir):
                         os.makedirs(model_save_dir)
                     fluid.dygraph.save_dygraph(net.state_dict(), model_path)
-                    
+
                     logging.info("saving infer model in %s" % model_path)
 
-        end_time = time.time()     
+        end_time = time.time()
         ce_info.append([np.mean(losses), end_time - start_time])
         # final save
-        logging.info("the final step is %s" % global_step)    
-        model_save_dir = os.path.join(args.output_dir,
-                                    conf_dict["model_path"])
+        logging.info("the final step is %s" % global_step)
+        model_save_dir = os.path.join(args.output_dir, conf_dict["model_path"])
         model_path = os.path.join(model_save_dir, str(global_step))
-        
+
         if not os.path.exists(model_save_dir):
             os.makedirs(model_save_dir)
         fluid.dygraph.save_dygraph(net.state_dict(), model_path)
@@ -263,19 +278,24 @@ def train(conf_dict, args):
             except:
                 logging.info("ce info err!")
             print("kpis\teach_step_duration_%s_card%s\t%s" %
-                (args.task_name, card_num, ce_time))
+                  (args.task_name, card_num, ce_time))
             print("kpis\ttrain_loss_%s_card%s\t%f" %
-                (args.task_name, card_num, ce_loss))
+                  (args.task_name, card_num, ce_loss))
 
         if args.do_test:
             # Get Feeder and Reader
-            test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
+            test_loader = fluid.io.DataLoader.from_generator(
+                capacity=16,
+                return_list=True,
+                iterable=True,
+                use_double_buffer=True)
             get_test_examples = simnet_process.get_reader("test")
-            test_pyreader.decorate_sample_list_generator(
-                paddle.batch(get_test_examples, batch_size=args.batch_size),
+            test_loader.set_sample_list_generator(
+                paddle.batch(
+                    get_test_examples, batch_size=args.batch_size),
                 place)
             pred_list = []
-            for left, pos_right in test_pyreader():
+            for left, pos_right in test_loader():
                 left = fluid.layers.reshape(left, shape=[-1, 1])
                 pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
                 net.eval()
@@ -284,15 +304,15 @@ def train(conf_dict, args):
                 left_feat, pos_score = net(left, pos_right)
                 pred = pos_score
                 pred_list += list(pred.numpy())
-            test_result = valid_and_test(pred_list, simnet_process, "test") 
-            if args.compute_accuracy: 
+            test_result = valid_and_test(pred_list, simnet_process, "test")
+            if args.compute_accuracy:
                 test_auc, test_acc = test_result
                 logging.info("AUC of test is %f, Accuracy of test is %f" %
-                            (test_auc, test_acc))
+                             (test_auc, test_acc))
             else:
                 test_auc = test_result
                 logging.info("AUC of test is %f" % test_auc)
-        
+
 
 def test(conf_dict, args):
     """
@@ -307,47 +327,53 @@ def test(conf_dict, args):
 
         vocab = utils.load_vocab(args.vocab_path)
         simnet_process = reader.SimNetProcessor(args, vocab)
-        test_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
+        test_loader = fluid.io.DataLoader.from_generator(
+            capacity=16,
+            return_list=True,
+            iterable=True,
+            use_double_buffer=True)
         get_test_examples = simnet_process.get_reader("test")
-        test_pyreader.decorate_sample_list_generator(
-                paddle.batch(get_test_examples, batch_size=args.batch_size),
-                place)
-     
+        test_loader.set_sample_list_generator(
+            paddle.batch(
+                get_test_examples, batch_size=args.batch_size), place)
 
-        conf_dict['dict_size'] = len(vocab)         
+        conf_dict['dict_size'] = len(vocab)
         conf_dict['seq_len'] = args.seq_len
 
-        net = utils.import_class("./nets",
-                                conf_dict["net"]["module_name"],
-                                conf_dict["net"]["class_name"])(conf_dict)
-       
+        net = utils.import_class("./nets", conf_dict["net"]["module_name"],
+                                 conf_dict["net"]["class_name"])(conf_dict)
+
         model, _ = load_dygraph(args.init_checkpoint)
         net.set_dict(model)
         metric = fluid.metrics.Auc(name="auc")
         pred_list = []
-        with io.open("predictions.txt", "w", encoding="utf8") as predictions_file:
+        with io.open(
+                "predictions.txt", "w", encoding="utf8") as predictions_file:
             if args.task_mode == "pairwise":
-                for left, pos_right in test_pyreader():
+                for left, pos_right in test_loader():
                     left = fluid.layers.reshape(left, shape=[-1, 1])
                     pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
-                
+
                     left_feat, pos_score = net(left, pos_right)
                     pred = pos_score
 
-                    pred_list += list(map(lambda item: float(item[0]), pred.numpy()))
+                    pred_list += list(
+                        map(lambda item: float(item[0]), pred.numpy()))
                     predictions_file.write(u"\n".join(
-                            map(lambda item: str((item[0] + 1) / 2), pred.numpy())) + "\n")
+                        map(lambda item: str((item[0] + 1) / 2), pred.numpy()))
+                                           + "\n")
 
             else:
-                for left, right in test_pyreader():
+                for left, right in test_loader():
                     left = fluid.layers.reshape(left, shape=[-1, 1])
                     right = fluid.layers.reshape(right, shape=[-1, 1])
                     left_feat, pred = net(left, right)
 
-                    pred_list += list(map(lambda item: float(item[0]), pred.numpy()))
+                    pred_list += list(
+                        map(lambda item: float(item[0]), pred.numpy()))
                     predictions_file.write(u"\n".join(
-                            map(lambda item: str(np.argmax(item)), pred.numpy())) + "\n")
-
+                        map(lambda item: str(np.argmax(item)), pred.numpy())) +
+                                           "\n")
 
             if args.task_mode == "pairwise":
                 pred_list = np.array(pred_list).reshape((-1, 1))
@@ -361,16 +387,16 @@ def test(conf_dict, args):
             metric.update(pred_list, labels)
             if args.compute_accuracy:
                 acc = utils.get_accuracy(pred_list, labels, args.task_mode,
-                                        args.lamda)
+                                         args.lamda)
                 logging.info("AUC of test is %f, Accuracy of test is %f" %
-                            (metric.eval(), acc))
+                             (metric.eval(), acc))
             else:
                 logging.info("AUC of test is %f" % metric.eval())
 
         if args.verbose_result:
             utils.get_result_file(args)
             logging.info("test result saved in %s" %
-                        os.path.join(os.getcwd(), args.test_result_path))
+                         os.path.join(os.getcwd(), args.test_result_path))
 
 
 def infer(conf_dict, args):
@@ -382,50 +408,53 @@ def infer(conf_dict, args):
         place = fluid.CUDAPlace(0)
     else:
         place = fluid.CPUPlace()
-   
 
     with fluid.dygraph.guard(place):
         vocab = utils.load_vocab(args.vocab_path)
         simnet_process = reader.SimNetProcessor(args, vocab)
         get_infer_examples = simnet_process.get_infer_reader
-        infer_pyreader = fluid.io.PyReader(capacity=16, return_list=True, use_double_buffer=True)
-        infer_pyreader.decorate_sample_list_generator(
-                    paddle.batch(get_infer_examples, batch_size=args.batch_size),
-                    place) 
-        
-        conf_dict['dict_size'] = len(vocab) 
+        infer_loader = fluid.io.DataLoader.from_generator(
+            capacity=16,
+            return_list=True,
+            iterable=True,
+            use_double_buffer=True)
+        infer_loader.set_sample_list_generator(
+            paddle.batch(
+                get_infer_examples, batch_size=args.batch_size), place)
+
+        conf_dict['dict_size'] = len(vocab)
         conf_dict['seq_len'] = args.seq_len
 
-        net = utils.import_class("./nets",
-                                conf_dict["net"]["module_name"],
-                                conf_dict["net"]["class_name"])(conf_dict)
+        net = utils.import_class("./nets", conf_dict["net"]["module_name"],
+                                 conf_dict["net"]["class_name"])(conf_dict)
         model, _ = load_dygraph(args.init_checkpoint)
         net.set_dict(model)
-        
+
         pred_list = []
         if args.task_mode == "pairwise":
-            for left, pos_right in infer_pyreader():
+            for left, pos_right in infer_loader():
                 left = fluid.layers.reshape(left, shape=[-1, 1])
                 pos_right = fluid.layers.reshape(pos_right, shape=[-1, 1])
-                
+
                 left_feat, pos_score = net(left, pos_right)
                 pred = pos_score
                 pred_list += list(
-                            map(lambda item: str((item[0] + 1) / 2), pred.numpy()))
-            
+                    map(lambda item: str((item[0] + 1) / 2), pred.numpy()))
+
         else:
-            for left, right in infer_pyreader():
+            for left, right in infer_loader():
                 left = fluid.layers.reshape(left, shape=[-1, 1])
                 pos_right = fluid.layers.reshape(right, shape=[-1, 1])
                 left_feat, pred = net(left, right)
-                pred_list += map(lambda item: str(np.argmax(item)), pred.numpy())
-        
-    
-        with io.open(args.infer_result_path, "w", encoding="utf8") as infer_file:
+                pred_list += map(lambda item: str(np.argmax(item)),
+                                 pred.numpy())
+
+        with io.open(
+                args.infer_result_path, "w", encoding="utf8") as infer_file:
             for _data, _pred in zip(simnet_process.get_infer_data(), pred_list):
                 infer_file.write(_data + "\t" + _pred + "\n")
         logging.info("infer result saved in %s" %
-                    os.path.join(os.getcwd(), args.infer_result_path))
+                     os.path.join(os.getcwd(), args.infer_result_path))
 
 
 def get_cards():
@@ -435,6 +464,7 @@ def get_cards():
         num = len(cards.split(","))
     return num
 
+
 if __name__ == "__main__":
 
     args = ArgConfig()
diff --git a/dygraph/transformer/README.md b/dygraph/transformer/README.md
index 6776e618a69fe35ed46552faf9512f58a07e7685..4b8247acdc81e1b1a5844da51e290c490a7fc4fa 100644
--- a/dygraph/transformer/README.md
+++ b/dygraph/transformer/README.md
@@ -28,7 +28,7 @@
 
 1. paddle安装
 
-   本项目依赖于 PaddlePaddle 1.7及以上版本或适当的develop版本，请参考 [安装指南](http://www.paddlepaddle.org/#quick-start) 进行安装
+   本项目依赖于 PaddlePaddle 1.8及以上版本或适当的develop版本，请参考 [安装指南](https://www.paddlepaddle.org.cn/install/quick) 进行安装
 
 2. 下载代码
 
@@ -40,7 +40,7 @@
 
 3. 环境依赖
 
-   请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html)部分的内容
+   请参考PaddlePaddle[安装说明](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/index_cn.html)部分的内容
 
 
 ### 数据准备
@@ -76,6 +76,7 @@ python -u train.py \
   --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
   --special_token '<s>' '<e>' '<unk>' \
   --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
+  --validation_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
   --batch_size 4096
 ```
 
@@ -91,6 +92,7 @@ python -u train.py \
   --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
   --special_token '<s>' '<e>' '<unk>' \
   --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
+  --validation_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
   --batch_size 4096 \
   --n_head 16 \
   --d_model 1024 \
@@ -121,10 +123,11 @@ Paddle动态图支持多进程多卡进行模型训练，启动训练的方式
 ```sh
 python -m paddle.distributed.launch --started_port 8999 --selected_gpus=0,1,2,3,4,5,6,7 --log_dir ./mylog train.py \
   --epoch 30 \
-  --src_vocab_fpath wmt16_ende_data_bpe/vocab_all.bpe.32000 \
-  --trg_vocab_fpath wmt16_ende_data_bpe/vocab_all.bpe.32000 \
+  --src_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
+  --trg_vocab_fpath gen_data/wmt16_ende_data_bpe/vocab_all.bpe.32000 \
   --special_token '<s>' '<e>' '<unk>' \
-  --training_file wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
+  --training_file gen_data/wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de \
+  --validation_file gen_data/wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de \
   --batch_size 4096 \
   --print_step 100 \
   --use_cuda True \
diff --git a/dygraph/transformer/config.py b/dygraph/transformer/config.py
deleted file mode 100644
index b6e1b2bb0a2bc590c36673e7ffcde43ccd65c538..0000000000000000000000000000000000000000
--- a/dygraph/transformer/config.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-class TrainTaskConfig(object):
-    """
-    TrainTaskConfig
-    """
-    # the epoch number to train.
-    pass_num = 20
-    # the number of sequences contained in a mini-batch.
-    # deprecated, set batch_size in args.
-    batch_size = 32
-    # the hyper parameters for Adam optimizer.
-    # This static learning_rate will be multiplied to the LearningRateScheduler
-    # derived learning rate the to get the final learning rate.
-    learning_rate = 2.0
-    beta1 = 0.9
-    beta2 = 0.997
-    eps = 1e-9
-    # the parameters for learning rate scheduling.
-    warmup_steps = 8000
-    # the weight used to mix up the ground-truth distribution and the fixed
-    # uniform distribution in label smoothing when training.
-    # Set this as zero if label smoothing is not wanted.
-    label_smooth_eps = 0.1
-
-
-class InferTaskConfig(object):
-    # the number of examples in one run for sequence generation.
-    batch_size = 4
-    # the parameters for beam search.
-    beam_size = 4
-    alpha=0.6
-    # max decoded length, should be less than ModelHyperParams.max_length
-    max_out_len = 30
-
-
-
-class ModelHyperParams(object):
-    """
-    ModelHyperParams
-    """
-    # These following five vocabularies related configurations will be set
-    # automatically according to the passed vocabulary path and special tokens.
-    # size of source word dictionary.
-    src_vocab_size = 10000
-    # size of target word dictionay
-    trg_vocab_size = 10000
-    # index for <bos> token
-    bos_idx = 0
-    # index for <eos> token
-    eos_idx = 1
-    # index for <unk> token
-    unk_idx = 2
-
-    # max length of sequences deciding the size of position encoding table.
-    max_length = 50
-    # the dimension for word embeddings, which is also the last dimension of
-    # the input and output of multi-head attention, position-wise feed-forward
-    # networks, encoder and decoder.
-    d_model = 512
-    # size of the hidden layer in position-wise feed-forward networks.
-    d_inner_hid = 2048
-    # the dimension that keys are projected to for dot-product attention.
-    d_key = 64
-    # the dimension that values are projected to for dot-product attention.
-    d_value = 64
-    # number of head used in multi-head attention.
-    n_head = 8
-    # number of sub-layers to be stacked in the encoder and decoder.
-    n_layer = 6
-    # dropout rates of different modules.
-    prepostprocess_dropout = 0.1
-    attention_dropout = 0.1
-    relu_dropout = 0.1
-    # to process before each sub-layer
-    preprocess_cmd = "n"  # layer normalization
-    # to process after each sub-layer
-    postprocess_cmd = "da"  # dropout + residual connection
-    # the flag indicating whether to share embedding and softmax weights.
-    # vocabularies in source and target should be same for weight sharing.
-    weight_sharing = False
-
-
-# The placeholder for batch_size in compile time. Must be -1 currently to be
-# consistent with some ops' infer-shape output in compile time, such as the
-# sequence_expand op used in beamsearch decoder.
-batch_size = -1
-# The placeholder for squence length in compile time.
-seq_len = ModelHyperParams.max_length
-# Here list the data shapes and data types of all inputs.
-# The shapes here act as placeholder and are set to pass the infer-shape in
-# compile time.
-input_descs = {
-    # The actual data shape of src_word is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_word": [(batch_size, seq_len, 1), "int64", 2],
-    # The actual data shape of src_pos is:
-    # [batch_size, max_src_len_in_batch, 1]
-    "src_pos": [(batch_size, seq_len, 1), "int64"],
-    # This input is used to remove attention weights on paddings in the
-    # encoder.
-    # The actual data shape of src_slf_attn_bias is:
-    # [batch_size, n_head, max_src_len_in_batch, max_src_len_in_batch]
-    "src_slf_attn_bias":
-    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
-    # The actual data shape of trg_word is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_word": [(batch_size, seq_len, 1), "int64",
-                 2],  # lod_level is only used in fast decoder.
-    # The actual data shape of trg_pos is:
-    # [batch_size, max_trg_len_in_batch, 1]
-    "trg_pos": [(batch_size, seq_len, 1), "int64"],
-    # This input is used to remove attention weights on paddings and
-    # subsequent words in the decoder.
-    # The actual data shape of trg_slf_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_trg_len_in_batch]
-    "trg_slf_attn_bias":
-    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
-    # This input is used to remove attention weights on paddings of the source
-    # input in the encoder-decoder attention.
-    # The actual data shape of trg_src_attn_bias is:
-    # [batch_size, n_head, max_trg_len_in_batch, max_src_len_in_batch]
-    "trg_src_attn_bias":
-    [(batch_size, ModelHyperParams.n_head, seq_len, seq_len), "float32"],
-    # This input is used in independent decoder program for inference.
-    # The actual data shape of enc_output is:
-    # [batch_size, max_src_len_in_batch, d_model]
-    "enc_output": [(batch_size, seq_len, ModelHyperParams.d_model), "float32"],
-    # The actual data shape of label_word is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_word": [(batch_size * seq_len, 1), "int64"],
-    # This input is used to mask out the loss of paddding tokens.
-    # The actual data shape of label_weight is:
-    # [batch_size * max_trg_len_in_batch, 1]
-    "lbl_weight": [(batch_size * seq_len, 1), "float32"],
-    # This input is used in beam-search decoder.
-    "init_score": [(batch_size, 1), "float32", 2],
-    # This input is used in beam-search decoder for the first gather
-    # (cell states updation)
-    "init_idx": [(batch_size, ), "int32"],
-}
-
-# Names of word embedding table which might be reused for weight sharing.
-word_emb_param_names = (
-    "src_word_emb_table",
-    "trg_word_emb_table",
-)
-# Names of position encoding table which will be initialized externally.
-pos_enc_param_names = (
-    "src_pos_enc_table",
-    "trg_pos_enc_table",
-)
-# separated inputs for different usages.
-encoder_data_input_fields = (
-    "src_word",
-    "src_pos",
-    "src_slf_attn_bias",
-)
-decoder_data_input_fields = (
-    "trg_word",
-    "trg_pos",
-    "trg_slf_attn_bias",
-    "trg_src_attn_bias",
-    "enc_output",
-)
-label_data_input_fields = (
-    "lbl_word",
-    "lbl_weight",
-)
-# In fast decoder, trg_pos (only containing the current time step) is generated
-# by ops and trg_slf_attn_bias is not needed.
-fast_decoder_data_input_fields = (
-    "trg_word",
-    # "init_score",
-    # "init_idx",
-    "trg_src_attn_bias",
-)
-
-
-def merge_cfg_from_list(cfg_list, g_cfgs):
-    """
-    Set the above global configurations using the cfg_list.
-    """
-    assert len(cfg_list) % 2 == 0
-    for key, value in zip(cfg_list[0::2], cfg_list[1::2]):
-        for g_cfg in g_cfgs:
-            if hasattr(g_cfg, key):
-                try:
-                    value = eval(value)
-                except Exception:  # for file path
-                    pass
-                setattr(g_cfg, key, value)
-                break
diff --git a/dygraph/transformer/model.py b/dygraph/transformer/model.py
index 3e8ec488079fd21f8223639a1e375d70e7e0bfad..70f64f50d95584d26e5a2d6c34fe4cdd4013e6eb 100644
--- a/dygraph/transformer/model.py
+++ b/dygraph/transformer/model.py
@@ -18,12 +18,9 @@ import numpy as np
 
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-from paddle.fluid.layers.utils import map_structure
 from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, to_variable
 from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay
 
-from config import word_emb_param_names, pos_enc_param_names
-
 
 def position_encoding_init(n_position, d_pos_vec):
     """
@@ -91,9 +88,9 @@ class PrePostProcessLayer(Layer):
                             bias_attr=fluid.ParamAttr(
                                 initializer=fluid.initializer.Constant(0.)))))
             elif cmd == "d":  # add dropout
-                if dropout_rate:
-                    self.functors.append(lambda x: layers.dropout(
-                        x, dropout_prob=dropout_rate, is_test=False))
+                self.functors.append(lambda x: layers.dropout(
+                    x, dropout_prob=dropout_rate, is_test=False)
+                                     if dropout_rate else x)
 
     def forward(self, x, residual=None):
         for i, cmd in enumerate(self.process_cmd):
@@ -128,28 +125,45 @@ class MultiHeadAttention(Layer):
                               output_dim=d_model,
                               bias_attr=False)
 
-    def forward(self, queries, keys, values, attn_bias, cache=None):
-        # compute q ,k ,v
-        keys = queries if keys is None else keys
-        values = keys if values is None else values
+    def _prepare_qkv(self, queries, keys, values, cache=None):
+        if keys is None:  # self-attention
+            keys, values = queries, queries
+            static_kv = False
+        else:  # cross-attention
+            static_kv = True
 
         q = self.q_fc(queries)
-        k = self.k_fc(keys)
-        v = self.v_fc(values)
-
-        # split head
         q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key])
         q = layers.transpose(x=q, perm=[0, 2, 1, 3])
-        k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
-        k = layers.transpose(x=k, perm=[0, 2, 1, 3])
-        v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
-        v = layers.transpose(x=v, perm=[0, 2, 1, 3])
+
+        if cache is not None and static_kv and "static_k" in cache:
+            # encoder-decoder attention at inference: reuse cached static k/v
+            k = cache["static_k"]
+            v = cache["static_v"]
+        else:
+            k = self.k_fc(keys)
+            v = self.v_fc(values)
+            k = layers.reshape(x=k, shape=[0, 0, self.n_head, self.d_key])
+            k = layers.transpose(x=k, perm=[0, 2, 1, 3])
+            v = layers.reshape(x=v, shape=[0, 0, self.n_head, self.d_value])
+            v = layers.transpose(x=v, perm=[0, 2, 1, 3])
 
         if cache is not None:
-            cache_k, cache_v = cache["k"], cache["v"]
-            k = layers.concat([cache_k, k], axis=2)
-            v = layers.concat([cache_v, v], axis=2)
-            cache["k"], cache["v"] = k, v
+            if static_kv and not "static_k" in cache:
+                # encoder-decoder attention at inference: not cached yet, store k/v
+                cache["static_k"], cache["static_v"] = k, v
+            elif not static_kv:
+                # for decoder self-attention in inference
+                cache_k, cache_v = cache["k"], cache["v"]
+                k = layers.concat([cache_k, k], axis=2)
+                v = layers.concat([cache_v, v], axis=2)
+                cache["k"], cache["v"] = k, v
+
+        return q, k, v
+
+    def forward(self, queries, keys, values, attn_bias, cache=None):
+        # compute q ,k ,v
+        q, k, v = self._prepare_qkv(queries, keys, values, cache)
 
         # scale dot product attention
         product = layers.matmul(x=q,
@@ -164,7 +178,7 @@ class MultiHeadAttention(Layer):
                                      dropout_prob=self.dropout_rate,
                                      is_test=False)
 
-            out = layers.matmul(weights, v)
+        out = layers.matmul(weights, v)
 
         # combine heads
         out = layers.transpose(out, perm=[0, 2, 1, 3])
@@ -381,7 +395,7 @@ class DecoderLayer(Layer):
 
         cross_attn_output = self.cross_attn(
             self.preprocesser2(self_attn_output), enc_output, enc_output,
-            cross_attn_bias)
+            cross_attn_bias, cache)
         cross_attn_output = self.postprocesser2(cross_attn_output,
                                                 self_attn_output)
 
@@ -584,13 +598,10 @@ class Transformer(Layer):
         Beam search with the alive and finished two queues, both have a beam size
         capicity separately. It includes `grow_topk` `grow_alive` `grow_finish` as
         steps.
-
         1. `grow_topk` selects the top `2*beam_size` candidates to avoid all getting
         EOS.
-
         2. `grow_alive` selects the top `beam_size` non-EOS candidates as the inputs
         of next decoding step.
-
         3. `grow_finish` compares the already finished candidates in the finished queue
         and newly added finished candidates from `grow_topk`, and selects the top
         `beam_size` finished candidates.
@@ -681,7 +692,8 @@ class Transformer(Layer):
                          finished_in_finished):
             max_length_penalty = np.power(((5. + max_len) / 6.), alpha)
             # The best possible score of the most likely alive sequence
-            lower_bound_alive_scores = alive_log_probs[:, 0] / max_length_penalty
+            lower_bound_alive_scores = alive_log_probs[:,
+                                                       0] / max_length_penalty
 
             # Now to compute the lowest score of a finished sequence in finished
             # If the sequence isn't finished, we multiply it's score by 0. since
@@ -810,6 +822,36 @@ class Transformer(Layer):
                     eos_id=1,
                     beam_size=4,
                     max_len=256):
+        if beam_size == 1:
+            return self._greedy_search(src_word,
+                                       src_pos,
+                                       src_slf_attn_bias,
+                                       trg_word,
+                                       trg_src_attn_bias,
+                                       bos_id=bos_id,
+                                       eos_id=eos_id,
+                                       max_len=max_len)
+        else:
+            return self._beam_search(src_word,
+                                     src_pos,
+                                     src_slf_attn_bias,
+                                     trg_word,
+                                     trg_src_attn_bias,
+                                     bos_id=bos_id,
+                                     eos_id=eos_id,
+                                     beam_size=beam_size,
+                                     max_len=max_len)
+
+    def _beam_search(self,
+                     src_word,
+                     src_pos,
+                     src_slf_attn_bias,
+                     trg_word,
+                     trg_src_attn_bias,
+                     bos_id=0,
+                     eos_id=1,
+                     beam_size=4,
+                     max_len=256):
         def expand_to_beam_size(tensor, beam_size):
             tensor = layers.reshape(tensor,
                                     [tensor.shape[0], 1] + tensor.shape[1:])
@@ -822,22 +864,30 @@ class Transformer(Layer):
                                   tensor.shape[2:])
 
         def split_batch_beams(tensor):
-            return fluid.layers.reshape(tensor,
-                                        shape=[-1, beam_size] +
-                                        list(tensor.shape[1:]))
+            return layers.reshape(tensor,
+                                  shape=[-1, beam_size] +
+                                  list(tensor.shape[1:]))
 
         def mask_probs(probs, finished, noend_mask_tensor):
             # TODO: use where_op
             finished = layers.cast(finished, dtype=probs.dtype)
-            probs = layers.elementwise_mul(
-                layers.expand(layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
-                noend_mask_tensor, axis=-1) - layers.elementwise_mul(probs, (finished - 1), axis=0)
+            probs = layers.elementwise_mul(layers.expand(
+                layers.unsqueeze(finished, [2]), [1, 1, self.trg_vocab_size]),
+                                           noend_mask_tensor,
+                                           axis=-1) - layers.elementwise_mul(
+                                               probs, (finished - 1), axis=0)
             return probs
 
         def gather(x, indices, batch_pos):
-            topk_coordinates = fluid.layers.stack([batch_pos, indices], axis=2)
+            topk_coordinates = layers.stack([batch_pos, indices], axis=2)
             return layers.gather_nd(x, topk_coordinates)
 
+        def update_states(func, caches):
+            for cache in caches:  # no need to update static_kv
+                cache["k"] = func(cache["k"])
+                cache["v"] = func(cache["v"])
+            return caches
+
         # run encoder
         enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)
 
@@ -852,7 +902,7 @@ class Transformer(Layer):
             np.full([batch_size, beam_size], eos_id, dtype="int64"))
         noend_array = [-inf] * self.trg_vocab_size
         noend_array[eos_id] = 0
-        noend_mask_tensor = to_variable(np.array(noend_array,dtype="float32"))
+        noend_mask_tensor = to_variable(np.array(noend_array, dtype="float32"))
         batch_pos = layers.expand(
             layers.unsqueeze(
                 to_variable(np.arange(0, batch_size, 1, dtype="int64")), [1]),
@@ -874,7 +924,8 @@ class Transformer(Layer):
         trg_pos = layers.zeros_like(trg_word)
         trg_src_attn_bias = merge_batch_beams(
             expand_to_beam_size(trg_src_attn_bias, beam_size))
-        enc_output = merge_batch_beams(expand_to_beam_size(enc_output, beam_size))
+        enc_output = merge_batch_beams(
+            expand_to_beam_size(enc_output, beam_size))
         ## init states (caches) for transformer, need to be updated according to selected beam
         caches = [{
             "k":
@@ -893,30 +944,29 @@ class Transformer(Layer):
             trg_pos = layers.fill_constant(shape=trg_word.shape,
                                            dtype="int64",
                                            value=i)
-            caches = map_structure(  # can not be reshaped since the 0 size
+            caches = update_states(  # can not be reshaped since the 0 size
                 lambda x: x if i == 0 else merge_batch_beams(x), caches)
             logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
                                   enc_output, caches)
-            caches = map_structure(split_batch_beams, caches)
+            caches = update_states(split_batch_beams, caches)
             step_log_probs = split_batch_beams(
-                fluid.layers.log(fluid.layers.softmax(logits)))
+                layers.log(layers.softmax(logits)))
             step_log_probs = mask_probs(step_log_probs, finished,
                                         noend_mask_tensor)
             log_probs = layers.elementwise_add(x=step_log_probs,
-                                                    y=log_probs,
-                                                    axis=0)
+                                               y=log_probs,
+                                               axis=0)
             log_probs = layers.reshape(log_probs,
                                        [-1, beam_size * self.trg_vocab_size])
             scores = log_probs
-            topk_scores, topk_indices = fluid.layers.topk(input=scores,
-                                                          k=beam_size)
-            beam_indices = fluid.layers.elementwise_floordiv(
-                topk_indices, vocab_size_tensor)
-            token_indices = fluid.layers.elementwise_mod(
-                topk_indices, vocab_size_tensor)
+            topk_scores, topk_indices = layers.topk(input=scores, k=beam_size)
+            beam_indices = layers.elementwise_floordiv(topk_indices,
+                                                       vocab_size_tensor)
+            token_indices = layers.elementwise_mod(topk_indices,
+                                                   vocab_size_tensor)
 
             # update states
-            caches = map_structure(lambda x: gather(x, beam_indices, batch_pos),
+            caches = update_states(lambda x: gather(x, beam_indices, batch_pos),
                                    caches)
             log_probs = gather(log_probs, topk_indices, batch_pos)
             finished = gather(finished, beam_indices, batch_pos)
@@ -937,3 +987,75 @@ class Transformer(Layer):
         finished_scores = topk_scores
 
         return finished_seq, finished_scores
+
+    def _greedy_search(self,
+                       src_word,
+                       src_pos,
+                       src_slf_attn_bias,
+                       trg_word,
+                       trg_src_attn_bias,
+                       bos_id=0,
+                       eos_id=1,
+                       max_len=256):
+        # run encoder
+        enc_output = self.encoder(src_word, src_pos, src_slf_attn_bias)
+
+        # constant number
+        batch_size = enc_output.shape[0]
+        max_len = (enc_output.shape[1] + 20) if max_len is None else max_len
+        end_token_tensor = layers.fill_constant(shape=[batch_size, 1],
+                                                dtype="int64",
+                                                value=eos_id)
+
+        predict_ids = []
+        log_probs = layers.fill_constant(shape=[batch_size, 1],
+                                         dtype="float32",
+                                         value=0)
+        trg_word = layers.fill_constant(shape=[batch_size, 1],
+                                        dtype="int64",
+                                        value=bos_id)
+        finished = layers.fill_constant(shape=[batch_size, 1],
+                                        dtype="bool",
+                                        value=0)
+
+        ## init states (caches) for transformer
+        caches = [{
+            "k":
+            layers.fill_constant(shape=[batch_size, self.n_head, 0, self.d_key],
+                                 dtype=enc_output.dtype,
+                                 value=0),
+            "v":
+            layers.fill_constant(
+                shape=[batch_size, self.n_head, 0, self.d_value],
+                dtype=enc_output.dtype,
+                value=0),
+        } for i in range(self.n_layer)]
+
+        for i in range(max_len):
+            trg_pos = layers.fill_constant(shape=trg_word.shape,
+                                           dtype="int64",
+                                           value=i)
+            logits = self.decoder(trg_word, trg_pos, None, trg_src_attn_bias,
+                                  enc_output, caches)
+            step_log_probs = layers.log(layers.softmax(logits))
+            log_probs = layers.elementwise_add(x=step_log_probs,
+                                               y=log_probs,
+                                               axis=0)
+            scores = log_probs
+            topk_scores, topk_indices = layers.topk(input=scores, k=1)
+
+            finished = layers.logical_or(
+                finished, layers.equal(topk_indices, end_token_tensor))
+            trg_word = topk_indices
+            log_probs = topk_scores
+
+            predict_ids.append(topk_indices)
+
+            if layers.reduce_all(finished).numpy():
+                break
+
+        predict_ids = layers.stack(predict_ids, axis=0)
+        finished_seq = layers.transpose(predict_ids, [1, 2, 0])
+        finished_scores = topk_scores
+
+        return finished_seq, finished_scores
diff --git a/dygraph/transformer/reader.py b/dygraph/transformer/reader.py
index ef23c5e1e32fa4cee1ba5a42bb970a1a135879a0..0ac62a1b7d0e651f11ab18a40e165e5b496e10d6 100644
--- a/dygraph/transformer/reader.py
+++ b/dygraph/transformer/reader.py
@@ -306,6 +306,7 @@ class DataProcessor(object):
     :param seed: The seed for random.
     :type seed: int
     """
+
     def __init__(self,
                  src_vocab_fpath,
                  trg_vocab_fpath,
@@ -360,21 +361,23 @@ class DataProcessor(object):
 
     def load_src_trg_ids(self, fpattern, tar_fname):
         converters = [
-            Converter(vocab=self._src_vocab,
-                      beg=self._bos_idx,
-                      end=self._eos_idx,
-                      unk=self._unk_idx,
-                      delimiter=self._token_delimiter,
-                      add_beg=False)
+            Converter(
+                vocab=self._src_vocab,
+                beg=self._bos_idx,
+                end=self._eos_idx,
+                unk=self._unk_idx,
+                delimiter=self._token_delimiter,
+                add_beg=False)
         ]
         if not self._only_src:
             converters.append(
-                Converter(vocab=self._trg_vocab,
-                          beg=self._bos_idx,
-                          end=self._eos_idx,
-                          unk=self._unk_idx,
-                          delimiter=self._token_delimiter,
-                          add_beg=True))
+                Converter(
+                    vocab=self._trg_vocab,
+                    beg=self._bos_idx,
+                    end=self._eos_idx,
+                    unk=self._unk_idx,
+                    delimiter=self._token_delimiter,
+                    add_beg=True))
 
         converters = ComposedConverter(converters)
 
@@ -402,9 +405,8 @@ class DataProcessor(object):
             f = tarfile.open(fpaths[0], "rb")
             for line in f.extractfile(tar_fname):
                 fields = line.strip(b"\n").split(self._field_delimiter)
-                if (not self._only_src
-                        and len(fields) == 2) or (self._only_src
-                                                  and len(fields) == 1):
+                if (not self._only_src and len(fields) == 2) or (
+                        self._only_src and len(fields) == 1):
                     yield fields
         else:
             for fpath in fpaths:
@@ -414,9 +416,8 @@ class DataProcessor(object):
                 with open(fpath, "rb") as f:
                     for line in f:
                         fields = line.strip(b"\n").split(self._field_delimiter)
-                        if (not self._only_src
-                                and len(fields) == 2) or (self._only_src
-                                                          and len(fields) == 1):
+                        if (not self._only_src and len(fields) == 2) or (
+                                self._only_src and len(fields) == 1):
                             yield fields
 
     @staticmethod
@@ -512,8 +513,8 @@ class DataProcessor(object):
             for item in data_reader():
                 inst_num_per_part = len(item) // count
                 for i in range(count):
-                    yield item[inst_num_per_part * i:inst_num_per_part *
-                               (i + 1)]
+                    yield item[inst_num_per_part * i:inst_num_per_part * (i + 1
+                                                                          )]
 
         return __impl__
 
diff --git a/dygraph/transformer/train.py b/dygraph/transformer/train.py
index 96fe3bf1777f681de4625095dff7970ccfb7a590..75cbb2772103a0467d833b2544eaf1f4cd4f2610 100644
--- a/dygraph/transformer/train.py
+++ b/dygraph/transformer/train.py
@@ -24,7 +24,6 @@ import paddle.fluid as fluid
 
 from utils.configure import PDConfig
 from utils.check import check_gpu, check_version
-from utils.load import load_dygraph
 
 # include task-specific libs
 import reader
@@ -58,6 +57,25 @@ def do_train(args):
                                      max_length=args.max_length,
                                      n_head=args.n_head)
     batch_generator = processor.data_generator(phase="train")
+    if args.validation_file:
+        val_processor = reader.DataProcessor(
+            fpattern=args.validation_file,
+            src_vocab_fpath=args.src_vocab_fpath,
+            trg_vocab_fpath=args.trg_vocab_fpath,
+            token_delimiter=args.token_delimiter,
+            use_token_batch=args.use_token_batch,
+            batch_size=args.batch_size,
+            device_count=trainer_count,
+            pool_size=args.pool_size,
+            sort_type=args.sort_type,
+            shuffle=False,
+            shuffle_batch=False,
+            start_mark=args.special_token[0],
+            end_mark=args.special_token[1],
+            unk_mark=args.special_token[2],
+            max_length=args.max_length,
+            n_head=args.n_head)
+        val_batch_generator = val_processor.data_generator(phase="train")
     if trainer_count > 1:  # for multi-process gpu training
         batch_generator = fluid.contrib.reader.distributed_batch_reader(
             batch_generator)
@@ -74,6 +92,9 @@ def do_train(args):
         # define data loader
         train_loader = fluid.io.DataLoader.from_generator(capacity=10)
         train_loader.set_batch_generator(batch_generator, places=place)
+        if args.validation_file:
+            val_loader = fluid.io.DataLoader.from_generator(capacity=10)
+            val_loader.set_batch_generator(val_batch_generator, places=place)
 
         # define model
         transformer = Transformer(
@@ -98,13 +119,13 @@ def do_train(args):
 
         ## init from some checkpoint, to resume the previous training
         if args.init_from_checkpoint:
-            model_dict, opt_dict = load_dygraph(
+            model_dict, opt_dict = fluid.load_dygraph(
                 os.path.join(args.init_from_checkpoint, "transformer"))
             transformer.load_dict(model_dict)
             optimizer.set_dict(opt_dict)
         ## init from some pretrain models, to better solve the current task
         if args.init_from_pretrain_model:
-            model_dict, _ = load_dygraph(
+            model_dict, _ = fluid.load_dygraph(
                 os.path.join(args.init_from_pretrain_model, "transformer"))
             transformer.load_dict(model_dict)
 
@@ -123,11 +144,18 @@ def do_train(args):
         ce_time = []
         ce_ppl = []
         step_idx = 0
+
+        # NOTE: used for benchmarking; compared against args.max_iter below
+        total_batch_num = 0
+
         # train loop
         for pass_id in range(args.epoch):
             pass_start_time = time.time()
             batch_id = 0
             for input_data in train_loader():
+                if args.max_iter and total_batch_num == args.max_iter: #NOTE: used for benchmark
+                    return
+                batch_start = time.time()
                 (src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
                  trg_slf_attn_bias, trg_src_attn_bias, lbl_word,
                  lbl_weight) = input_data
@@ -167,13 +195,38 @@ def do_train(args):
                             total_avg_cost - loss_normalizer,
                             np.exp([min(total_avg_cost, 100)]),
                             args.print_step / (time.time() - avg_batch_time)))
-                        ce_ppl.append(np.exp([min(total_avg_cost, 100)]))
                         avg_batch_time = time.time()
 
-                if step_idx % args.save_step == 0 and step_idx != 0 and (
-                        trainer_count == 1
-                        or fluid.dygraph.parallel.Env().dev_id == 0):
-                    if args.save_model:
+
+                if step_idx % args.save_step == 0 and step_idx != 0:
+                    # Evaluate on the validation set (if args.validation_file is set) every save_step steps.
+                    if args.validation_file:
+                        transformer.eval()
+                        total_sum_cost = 0
+                        total_token_num = 0
+                        for input_data in val_loader():
+                            (src_word, src_pos, src_slf_attn_bias, trg_word,
+                             trg_pos, trg_slf_attn_bias, trg_src_attn_bias,
+                             lbl_word, lbl_weight) = input_data
+                            logits = transformer(src_word, src_pos,
+                                                 src_slf_attn_bias, trg_word,
+                                                 trg_pos, trg_slf_attn_bias,
+                                                 trg_src_attn_bias)
+                            sum_cost, avg_cost, token_num = criterion(
+                                logits, lbl_word, lbl_weight)
+                            total_sum_cost += sum_cost.numpy()
+                            total_token_num += token_num.numpy()
+                            total_avg_cost = total_sum_cost / total_token_num
+                        logging.info("validation, step_idx: %d, avg loss: %f, "
+                                     "normalized loss: %f, ppl: %f" %
+                                     (step_idx, total_avg_cost,
+                                      total_avg_cost - loss_normalizer,
+                                      np.exp([min(total_avg_cost, 100)])))
+                        transformer.train()
+
+                    if args.save_model and (
+                            trainer_count == 1
+                            or fluid.dygraph.parallel.Env().dev_id == 0):
                         model_dir = os.path.join(args.save_model,
                                                  "step_" + str(step_idx))
                         if not os.path.exists(model_dir):
@@ -186,6 +239,7 @@ def do_train(args):
                             os.path.join(model_dir, "transformer"))
 
                 batch_id += 1
+                total_batch_num = total_batch_num + 1
                 step_idx += 1
 
             time_consumed = time.time() - pass_start_time
diff --git a/dygraph/transformer/transformer.yaml b/dygraph/transformer/transformer.yaml
index 76151f207ca4b6b9a907779642f9f0f48cddc78b..15d9e783af2ab85ba6a9ba2800e8e3c6b6665ae6 100644
--- a/dygraph/transformer/transformer.yaml
+++ b/dygraph/transformer/transformer.yaml
@@ -19,6 +19,8 @@ inference_model_dir: "infer_model"
 random_seed: None
 # The pattern to match training data files.
 training_file: "wmt16_ende_data_bpe/train.tok.clean.bpe.32000.en-de"
+# The pattern to match validation data files.
+validation_file: "wmt16_ende_data_bpe/newstest2014.tok.bpe.32000.en-de"
 # The pattern to match test data files.
 predict_file: "wmt16_ende_data_bpe/newstest2016.tok.bpe.32000.en-de"
 # The file to output the translation results of predict_file to.